diff --git a/src/providers/homoxxx.rs b/src/providers/homoxxx.rs index e15e3b7..f73fb03 100644 --- a/src/providers/homoxxx.rs +++ b/src/providers/homoxxx.rs @@ -177,15 +177,18 @@ impl HomoxxxProvider { let search_string = query.to_lowercase().trim().replace(" ", "-"); let mut video_url = format!("{}/search/{}/{}/", self.url, search_string, page); - if search_string.starts_with("@") { - let url_part = search_string - .split("@") - .collect::>() - .get(1) - .copied() - .unwrap_or_default() - .replace(":", "/"); - video_url = format!("{}/{}/", self.url, url_part); + let trimmed = query.trim().trim_start_matches('@'); + if let Some((kind, value)) = trimmed.split_once(':') { + let kind = kind.trim().to_ascii_lowercase(); + let value = value.trim().replace(' ', "-"); + if !value.is_empty() + && matches!( + kind.as_str(), + "models" | "pornstars" | "stars" | "channels" | "categories" | "tags" + ) + { + video_url = format!("{}/{}/{}/", self.url, kind, value); + } } // Check our Video Cache. If the result is younger than 1 hour, we return it. let old_items = match cache.get(&video_url) { diff --git a/src/providers/missav.rs b/src/providers/missav.rs index 18470ee..05efb84 100644 --- a/src/providers/missav.rs +++ b/src/providers/missav.rs @@ -14,6 +14,8 @@ use diesel::r2d2; use error_chain::error_chain; use futures::future::join_all; use htmlentity::entity::{ICodedDataTrait, decode}; +use std::collections::HashMap; +use std::sync::{Arc, RwLock}; use std::vec; use wreq::Version; @@ -41,15 +43,58 @@ error_chain! { #[derive(Debug, Clone)] pub struct MissavProvider { url: String, + tag_map: Arc>>, } impl MissavProvider { pub fn new() -> Self { MissavProvider { url: "https://missav.ws".to_string(), + tag_map: Arc::new(RwLock::new(HashMap::new())), } } + fn normalize_key(value: &str) -> String { + value + .trim() + .to_ascii_lowercase() + .replace(['_', '-'], " ") + .split_whitespace() + .collect::>() + .join(" ") + } + + fn humanize_slug(value: &str) -> String { + value + .trim_matches('/') + .replace('-', " ") + .split_whitespace() + .collect::>() + .join(" ") + } + + fn insert_tag_mapping(&self, key: &str, path_or_url: &str) { + let normalized = Self::normalize_key(key); + if normalized.is_empty() || path_or_url.trim().is_empty() { + return; + } + if let Ok(mut map) = self.tag_map.write() { + map.insert(normalized, path_or_url.trim().to_string()); + } + } + + fn resolve_query_url(&self, query: &str, page: u8, sort: &str) -> Option { + let normalized = Self::normalize_key(query); + let mapped = self.tag_map.read().ok()?.get(&normalized)?.clone(); + let separator = if mapped.contains('?') { "&" } else { "?" }; + let mut url = format!("{mapped}{separator}page={page}"); + if !sort.is_empty() { + url.push_str("&sort="); + url.push_str(sort); + } + Some(url) + } + fn build_channel(&self, _clientversion: ClientVersion) -> Channel { Channel { id: "missav".to_string(), @@ -248,10 +293,13 @@ impl MissavProvider { if !sort.is_empty() { sort = format!("&sort={}", sort); } - let url_str = format!( + let mut url_str = format!( "{}/{}/search/{}?page={}{}", self.url, language, search_string, page, sort ); + if let Some(mapped_url) = self.resolve_query_url(query, page, &sort.replace("&sort=", "")) { + url_str = mapped_url; + } if let Some((time, items)) = cache.get(&url_str) { if time.elapsed().unwrap_or_default().as_secs() < 3600 { @@ -386,19 +434,54 @@ impl MissavProvider { // 3. Extract Tags (Generic approach to avoid repetitive code) let mut tags = vec![]; - for (label, prefix) in [ - ("Actress:", "@actress"), - ("Actor:", "@actor"), - ("Maker:", "@maker"), - ("Genre:", "@genre"), + for (label, route_kind) in [ + ("Actress:", "actress"), + ("Actor:", "actor"), + ("Maker:", "maker"), + ("Genre:", "genre"), ] { let marker = format!("{}", label); if let Some(section) = extract(&vid, &marker, "") { - for part in section.split("class=\"text-nord13 font-medium\">").skip(1) { - if let Some(val) = part.split('<').next() { - let clean = val.trim(); - if !clean.is_empty() { - tags.push(format!("{}:{}", prefix, clean)); + for anchor in section.split("") + .nth(1) + .and_then(|value| value.split('<').next()) + .map(str::trim) + .unwrap_or_default() + .to_string(); + if !title.is_empty() { + tags.push(title.clone()); + if !href.is_empty() { + let full_url = if href.starts_with("http://") || href.starts_with("https://") { + href.clone() + } else { + format!("{}{}", self.url, href) + }; + self.insert_tag_mapping(&title, &full_url); + let slug = href + .trim_matches('/') + .rsplit('/') + .next() + .unwrap_or_default() + .to_string(); + if !slug.is_empty() { + self.insert_tag_mapping(&slug, &full_url); + self.insert_tag_mapping( + &format!("{route_kind}:{}", slug), + &full_url, + ); + self.insert_tag_mapping( + &format!("{route_kind}:{}", Self::humanize_slug(&slug)), + &full_url, + ); + } } } } diff --git a/src/providers/okxxx.rs b/src/providers/okxxx.rs index c3a77eb..8bad79d 100644 --- a/src/providers/okxxx.rs +++ b/src/providers/okxxx.rs @@ -10,7 +10,9 @@ use crate::videos::{ServerOptions, VideoItem}; use async_trait::async_trait; use error_chain::error_chain; use htmlentity::entity::{ICodedDataTrait, decode}; +use std::collections::HashMap; use std::env; +use std::sync::{Arc, RwLock}; use std::vec; use wreq::Client; use wreq_util::Emulation; @@ -31,14 +33,66 @@ error_chain! { #[derive(Debug, Clone)] pub struct OkxxxProvider { url: String, + tag_map: Arc>>, } impl OkxxxProvider { pub fn new() -> Self { OkxxxProvider { url: "https://ok.xxx".to_string(), + tag_map: Arc::new(RwLock::new(HashMap::new())), } } + fn normalize_key(value: &str) -> String { + value + .trim() + .to_ascii_lowercase() + .replace(['_', '-'], " ") + .split_whitespace() + .collect::>() + .join(" ") + } + + fn humanize_slug(value: &str) -> String { + value + .trim_matches('/') + .replace('-', " ") + .split_whitespace() + .collect::>() + .join(" ") + } + + fn insert_tag_mapping(&self, kind: &str, slug: &str, title: Option<&str>) { + let slug = slug.trim().trim_matches('/'); + if slug.is_empty() { + return; + } + let path = format!("{kind}/{slug}"); + if let Ok(mut map) = self.tag_map.write() { + map.insert(Self::normalize_key(slug), path.clone()); + let normalized_title = Self::normalize_key(title.unwrap_or(slug)); + if !normalized_title.is_empty() { + map.insert(normalized_title, path); + } + } + } + + fn resolve_query_path(&self, query: &str) -> Option { + let trimmed = query.trim().trim_start_matches('@'); + if let Some((kind, raw_value)) = trimmed.split_once(':') { + let kind = kind.trim().to_ascii_lowercase(); + let value = raw_value.trim().trim_matches('/').replace(' ', "-"); + if !value.is_empty() && matches!(kind.as_str(), "sites" | "models") { + return Some(format!("{kind}/{value}")); + } + } + let normalized = Self::normalize_key(trimmed); + if normalized.is_empty() { + return None; + } + self.tag_map.read().ok()?.get(&normalized).cloned() + } + fn build_channel(&self, _clientversion: ClientVersion) -> Channel { Channel { id: "okxxx".to_string(), @@ -177,16 +231,8 @@ impl OkxxxProvider { async fn query(&self, cache: VideoCache, page: u8, query: &str) -> Result> { let search_string = query.to_lowercase().trim().replace(" ", "-"); let mut video_url = format!("{}/search/{}/{}/", self.url, search_string, page); - - if search_string.starts_with("@") { - let url_part = search_string - .split("@") - .collect::>() - .get(1) - .copied() - .unwrap_or_default() - .replace(":", "/"); - video_url = format!("{}/{}/", self.url, url_part); + if let Some(path) = self.resolve_query_path(query) { + video_url = format!("{}/{}/{}/", self.url, path, page); } // Check our Video Cache. If the result is younger than 1 hour, we return it. let old_items = match cache.get(&video_url) { @@ -405,7 +451,8 @@ impl OkxxxProvider { .collect::>(); for tag in raw_tags { if !tag.is_empty() { - tags.push(format!("@sites:{}", tag)); + self.insert_tag_mapping("sites", &tag, None); + tags.push(Self::humanize_slug(&tag)); } } } @@ -425,7 +472,8 @@ impl OkxxxProvider { .collect::>(); for tag in raw_tags { if !tag.is_empty() { - tags.push(format!("@models:{}", tag)); + self.insert_tag_mapping("models", &tag, None); + tags.push(Self::humanize_slug(&tag)); } } } diff --git a/src/providers/omgxxx.rs b/src/providers/omgxxx.rs index d6425f7..7b67795 100644 --- a/src/providers/omgxxx.rs +++ b/src/providers/omgxxx.rs @@ -883,11 +883,35 @@ impl OmgxxxProvider { ) -> Result> { let mut search_type = "search"; let mut search_string = query.to_string().to_ascii_lowercase().trim().to_string(); + let trimmed = query.trim().trim_start_matches('@').to_ascii_lowercase(); + if let Some((kind, raw)) = trimmed.split_once(':') { + let candidate = raw.trim().replace(' ', "-"); + if !candidate.is_empty() { + match kind.trim() { + "models" | "model" | "stars" => { + search_type = "models"; + search_string = candidate; + } + "sites" | "site" => { + search_type = "sites"; + search_string = candidate; + } + "networks" | "network" => { + search_type = "networks"; + search_string = candidate; + } + _ => {} + } + } + } match self.stars.read() { Ok(stars) => { if let Some(star) = stars .iter() - .find(|s| s.title.to_ascii_lowercase() == search_string) + .find(|s| { + s.title.eq_ignore_ascii_case(&search_string) + || s.id.eq_ignore_ascii_case(&search_string) + }) { search_type = "models"; search_string = star.id.clone(); @@ -901,7 +925,10 @@ impl OmgxxxProvider { Ok(sites) => { if let Some(site) = sites .iter() - .find(|s| s.title.to_ascii_lowercase() == search_string) + .find(|s| { + s.title.eq_ignore_ascii_case(&search_string) + || s.id.eq_ignore_ascii_case(&search_string) + }) { search_type = "sites"; search_string = site.id.clone(); @@ -911,6 +938,23 @@ impl OmgxxxProvider { report_provider_error_background("omgxxx", "query.sites_read", &e.to_string()); } } + match self.networks.read() { + Ok(networks) => { + if let Some(network) = networks + .iter() + .find(|n| { + n.title.eq_ignore_ascii_case(&search_string) + || n.id.eq_ignore_ascii_case(&search_string) + }) + { + search_type = "networks"; + search_string = network.id.clone(); + } + } + Err(e) => { + report_provider_error_background("omgxxx", "query.networks_read", &e.to_string()); + } + } let mut video_url = format!("{}/{}/{}/{}/", self.url, search_type, search_string, page); video_url = video_url.replace(" ", "+"); // Check our Video Cache. If the result is younger than 1 hour, we return it. diff --git a/src/providers/perfectgirls.rs b/src/providers/perfectgirls.rs index 2eb8b71..769adf0 100644 --- a/src/providers/perfectgirls.rs +++ b/src/providers/perfectgirls.rs @@ -10,7 +10,9 @@ use crate::videos::{ServerOptions, VideoItem}; use async_trait::async_trait; use error_chain::error_chain; use htmlentity::entity::{ICodedDataTrait, decode}; +use std::collections::HashMap; use std::env; +use std::sync::{Arc, RwLock}; use std::vec; use wreq::Client; use wreq_util::Emulation; @@ -31,14 +33,66 @@ error_chain! { #[derive(Debug, Clone)] pub struct PerfectgirlsProvider { url: String, + tag_map: Arc>>, } impl PerfectgirlsProvider { pub fn new() -> Self { PerfectgirlsProvider { url: "https://www.perfectgirls.xxx".to_string(), + tag_map: Arc::new(RwLock::new(HashMap::new())), } } + fn normalize_key(value: &str) -> String { + value + .trim() + .to_ascii_lowercase() + .replace(['_', '-'], " ") + .split_whitespace() + .collect::>() + .join(" ") + } + + fn humanize_slug(value: &str) -> String { + value + .trim_matches('/') + .replace('-', " ") + .split_whitespace() + .collect::>() + .join(" ") + } + + fn insert_tag_mapping(&self, kind: &str, slug: &str, title: Option<&str>) { + let slug = slug.trim().trim_matches('/'); + if slug.is_empty() { + return; + } + let path = format!("{kind}/{slug}"); + if let Ok(mut map) = self.tag_map.write() { + map.insert(Self::normalize_key(slug), path.clone()); + let normalized_title = Self::normalize_key(title.unwrap_or(slug)); + if !normalized_title.is_empty() { + map.insert(normalized_title, path); + } + } + } + + fn resolve_query_path(&self, query: &str) -> Option { + let trimmed = query.trim().trim_start_matches('@'); + if let Some((kind, raw_value)) = trimmed.split_once(':') { + let kind = kind.trim().to_ascii_lowercase(); + let value = raw_value.trim().trim_matches('/').replace(' ', "-"); + if !value.is_empty() && matches!(kind.as_str(), "channels" | "pornstars") { + return Some(format!("{kind}/{value}")); + } + } + let normalized = Self::normalize_key(trimmed); + if normalized.is_empty() { + return None; + } + self.tag_map.read().ok()?.get(&normalized).cloned() + } + fn build_channel(&self, _clientversion: ClientVersion) -> Channel { Channel { id: "perfectgirls".to_string(), @@ -177,16 +231,8 @@ impl PerfectgirlsProvider { async fn query(&self, cache: VideoCache, page: u8, query: &str) -> Result> { let search_string = query.to_lowercase().trim().replace(" ", "-"); let mut video_url = format!("{}/search/{}/{}/", self.url, search_string, page); - - if search_string.starts_with("@") { - let url_part = search_string - .split("@") - .collect::>() - .get(1) - .copied() - .unwrap_or_default() - .replace(":", "/"); - video_url = format!("{}/{}/", self.url, url_part); + if let Some(path) = self.resolve_query_path(query) { + video_url = format!("{}/{}/{}/", self.url, path, page); } // Check our Video Cache. If the result is younger than 1 hour, we return it. let old_items = match cache.get(&video_url) { @@ -407,7 +453,8 @@ impl PerfectgirlsProvider { .collect::>(); for tag in raw_tags { if !tag.is_empty() { - tags.push(format!("@channels:{}", tag)); + self.insert_tag_mapping("channels", &tag, None); + tags.push(Self::humanize_slug(&tag)); } } } @@ -427,7 +474,8 @@ impl PerfectgirlsProvider { .collect::>(); for tag in raw_tags { if !tag.is_empty() { - tags.push(format!("@pornstars:{}", tag)); + self.insert_tag_mapping("pornstars", &tag, None); + tags.push(Self::humanize_slug(&tag)); } } } diff --git a/src/providers/perverzija.rs b/src/providers/perverzija.rs index 1743712..8eac911 100644 --- a/src/providers/perverzija.rs +++ b/src/providers/perverzija.rs @@ -13,6 +13,8 @@ use futures::future::join_all; use htmlentity::entity::{ICodedDataTrait, decode}; use serde::Deserialize; use serde::Serialize; +use std::collections::HashMap; +use std::sync::{Arc, RwLock}; use wreq::Client; use wreq::Version; use wreq_util::Emulation; @@ -40,11 +42,13 @@ struct PerverzijaDbEntry { #[derive(Debug, Clone)] pub struct PerverzijaProvider { url: String, + tag_map: Arc>>, } impl PerverzijaProvider { pub fn new() -> Self { PerverzijaProvider { url: "https://tube.perverzija.com/".to_string(), + tag_map: Arc::new(RwLock::new(HashMap::new())), } } @@ -143,6 +147,104 @@ impl PerverzijaProvider { title.trim().to_string() } + fn clip_at_first<'a>(haystack: &'a str, end_markers: &[&str]) -> &'a str { + let mut end = haystack.len(); + for marker in end_markers { + if let Some(index) = haystack.find(marker) { + end = end.min(index); + } + } + &haystack[..end] + } + + fn listing_item_scope(haystack: &str) -> &str { + Self::clip_at_first(haystack, &["", "", "
(text: &'a str, label: &str) -> &'a str { + let section = text + .split(label) + .nth(1) + .unwrap_or_default(); + Self::clip_at_first( + section, + &["", "

", "", "
, value: String) { + let normalized = value.trim(); + if normalized.is_empty() { + return; + } + if !tags + .iter() + .any(|existing| existing.eq_ignore_ascii_case(normalized)) + { + tags.push(normalized.to_string()); + } + } + + fn parse_href_values(section: &str) -> Vec { + section + .split(" String { + value + .trim() + .to_ascii_lowercase() + .replace(['_', '-'], " ") + .split_whitespace() + .collect::>() + .join(" ") + } + + fn humanize_slug(value: &str) -> String { + value + .trim_matches('/') + .replace('-', " ") + .split_whitespace() + .collect::>() + .join(" ") + } + + fn insert_tag_mapping(&self, kind: &str, slug: &str, title: Option<&str>) { + let slug = slug.trim().trim_matches('/'); + if slug.is_empty() { + return; + } + let path = format!("{kind}/{slug}"); + if let Ok(mut map) = self.tag_map.write() { + map.insert(Self::normalize_key(slug), path.clone()); + let normalized_title = Self::normalize_key(title.unwrap_or(slug)); + if !normalized_title.is_empty() { + map.insert(normalized_title, path); + } + } + } + + fn resolve_query_path(&self, query: &str) -> Option { + let trimmed = query.trim().trim_start_matches('@'); + if let Some((kind, raw_value)) = trimmed.split_once(':') { + let kind = kind.trim().to_ascii_lowercase(); + let value = raw_value.trim().trim_matches('/').replace(' ', "-"); + if !value.is_empty() && matches!(kind.as_str(), "studio" | "stars" | "tag" | "genre") + { + return Some(format!("{kind}/{value}")); + } + } + let normalized = Self::normalize_key(trimmed); + if normalized.is_empty() { + return None; + } + self.tag_map.read().ok()?.get(&normalized).cloned() + } + async fn get( &self, cache: VideoCache, @@ -212,13 +314,8 @@ impl PerverzijaProvider { url_str = format!("{}?s={}", self.url, search_string); } - if query.starts_with("@studio:") { - let studio_name = query.replace("@studio:", ""); - url_str = format!("{}studio/{}/page/{}/", self.url, studio_name, page); - query_parse = false; - } else if query.starts_with("@stars:") { - let stars_name = query.replace("@stars:", ""); - url_str = format!("{}stars/{}/page/{}/", self.url, stars_name, page); + if let Some(path) = self.resolve_query_path(query) { + url_str = format!("{}/{}/page/{}/", self.url.trim_end_matches('/'), path, page); query_parse = false; } url_str = url_str.replace("page/1/", ""); @@ -292,7 +389,8 @@ impl PerverzijaProvider { return vec![]; } - for video_segment in raw_videos { + for raw_video_segment in raw_videos { + let video_segment = Self::listing_item_scope(raw_video_segment); let title = Self::extract_title(video_segment); let embed_html_raw = Self::extract_between(video_segment, "data-embed='", "'") @@ -370,15 +468,17 @@ impl PerverzijaProvider { let studios_parts = video_segment.split("a href=\"").collect::>(); for studio in studios_parts.iter().skip(1) { if studio.starts_with("https://tube.perverzija.com/studio/") { - tags.push( - studio - .split("/\"") - .collect::>() - .get(0) - .copied() - .unwrap_or_default() - .replace("https://tube.perverzija.com/studio/", "@studio:") - .to_string(), + let slug = studio + .split("/\"") + .collect::>() + .first() + .copied() + .unwrap_or_default() + .replace("https://tube.perverzija.com/studio/", ""); + self.insert_tag_mapping("studio", &slug, None); + Self::push_unique( + &mut tags, + Self::humanize_slug(&slug), ); } } @@ -396,7 +496,8 @@ impl PerverzijaProvider { .unwrap_or_default() .to_string(); if !tag_name.is_empty() { - tags.push(format!("@stars:{}", tag_name)); + self.insert_tag_mapping("stars", &tag_name, None); + Self::push_unique(&mut tags, Self::humanize_slug(&tag_name)); } } } @@ -407,7 +508,7 @@ impl PerverzijaProvider { if token.starts_with("tag-") { let tag_name = token.split("tag-").nth(1).unwrap_or_default().to_string(); if !tag_name.is_empty() { - tags.push(tag_name.replace("-", " ").to_string()); + Self::push_unique(&mut tags, tag_name.replace("-", " ").to_string()); } } } @@ -579,88 +680,58 @@ impl PerverzijaProvider { url_str = "!".to_string() } - let mut tags: Vec = Vec::new(); // Placeholder for tags, adjust as needed + let mut tags: Vec = Vec::new(); - let studios_parts = text - .split("Studio: ") - .collect::>() - .get(1) - .copied() - .unwrap_or_default() - .split("
") - .collect::>() - .get(0) - .copied() - .unwrap_or_default() - .split(">(); - for studio in studios_parts.iter().skip(1) { - if studio.starts_with("https://tube.perverzija.com/studio/") { - tags.push( - studio - .split("/\"") - .collect::>() - .get(0) - .copied() - .unwrap_or_default() - .replace("https://tube.perverzija.com/studio/", "@studio:") - .to_string(), - ); - } - } - if text.contains("Stars: ") { - let stars_parts: Vec<&str> = text - .split("Stars: ") - .collect::>() - .get(1) - .copied() - .unwrap_or_default() - .split("") - .collect::>() - .get(0) - .copied() - .unwrap_or_default() - .split(">(); - for star in stars_parts.iter().skip(1) { - if star.starts_with("https://tube.perverzija.com/stars/") { - tags.push( - star.split("/\"") - .collect::>() - .get(0) - .copied() - .unwrap_or_default() - .replace("https://tube.perverzija.com/stars/", "@stars:") - .to_string(), - ); - } + let studios_section = Self::detail_meta_section(&text, "Studio: "); + for href in Self::parse_href_values(studios_section) { + if href.starts_with("https://tube.perverzija.com/studio/") { + let studio_slug = href + .trim_end_matches('/') + .replace("https://tube.perverzija.com/studio/", ""); + self.insert_tag_mapping("studio", &studio_slug, None); + Self::push_unique(&mut tags, Self::humanize_slug(&studio_slug)); } } - let tags_parts: Vec<&str> = text - .split("Tags: ") - .collect::>() - .get(1) - .copied() - .unwrap_or_default() - .split("") - .collect::>() - .get(0) - .copied() - .unwrap_or_default() - .split(">(); - for star in tags_parts.iter().skip(1) { - if star.starts_with("https://tube.perverzija.com/stars/") { - tags.push( - star.split("/\"") - .collect::>() - .get(0) - .copied() - .unwrap_or_default() - .replace("https://tube.perverzija.com/stars/", "@stars:") - .to_string(), - ); + let stars_section = Self::detail_meta_section(&text, "Stars: "); + for href in Self::parse_href_values(stars_section) { + if href.starts_with("https://tube.perverzija.com/stars/") { + let star_slug = href + .trim_end_matches('/') + .replace("https://tube.perverzija.com/stars/", ""); + self.insert_tag_mapping("stars", &star_slug, None); + Self::push_unique(&mut tags, Self::humanize_slug(&star_slug)); + } + } + + let tags_section = if text.contains("Tags: ") { + Self::detail_meta_section(&text, "Tags: ") + } else { + Self::detail_meta_section(&text, "Genres: ") + }; + for href in Self::parse_href_values(tags_section) { + if href.starts_with("https://tube.perverzija.com/stars/") { + let star_slug = href + .trim_end_matches('/') + .replace("https://tube.perverzija.com/stars/", ""); + self.insert_tag_mapping("stars", &star_slug, None); + Self::push_unique(&mut tags, Self::humanize_slug(&star_slug)); + continue; + } + if href.starts_with("https://tube.perverzija.com/tag/") { + let tag_slug = href + .trim_end_matches('/') + .replace("https://tube.perverzija.com/tag/", ""); + self.insert_tag_mapping("tag", &tag_slug, None); + Self::push_unique(&mut tags, Self::humanize_slug(&tag_slug)); + continue; + } + if href.starts_with("https://tube.perverzija.com/genre/") { + let genre_slug = href + .trim_end_matches('/') + .replace("https://tube.perverzija.com/genre/", ""); + self.insert_tag_mapping("genre", &genre_slug, None); + Self::push_unique(&mut tags, Self::humanize_slug(&genre_slug)); } } diff --git a/src/providers/pornhat.rs b/src/providers/pornhat.rs index 7b9f164..fe26f68 100644 --- a/src/providers/pornhat.rs +++ b/src/providers/pornhat.rs @@ -9,6 +9,8 @@ use crate::videos::{ServerOptions, VideoItem}; use async_trait::async_trait; use error_chain::error_chain; use htmlentity::entity::{ICodedDataTrait, decode}; +use std::collections::HashMap; +use std::sync::{Arc, RwLock}; use std::vec; pub const CHANNEL_METADATA: crate::providers::ProviderChannelMetadata = @@ -27,14 +29,66 @@ error_chain! { #[derive(Debug, Clone)] pub struct PornhatProvider { url: String, + tag_map: Arc>>, } impl PornhatProvider { pub fn new() -> Self { PornhatProvider { url: "https://www.pornhat.com".to_string(), + tag_map: Arc::new(RwLock::new(HashMap::new())), } } + fn normalize_key(value: &str) -> String { + value + .trim() + .to_ascii_lowercase() + .replace(['_', '-'], " ") + .split_whitespace() + .collect::>() + .join(" ") + } + + fn humanize_slug(value: &str) -> String { + value + .trim_matches('/') + .replace('-', " ") + .split_whitespace() + .collect::>() + .join(" ") + } + + fn insert_tag_mapping(&self, kind: &str, slug: &str, title: Option<&str>) { + let slug = slug.trim().trim_matches('/'); + if slug.is_empty() { + return; + } + let path = format!("{kind}/{slug}"); + if let Ok(mut map) = self.tag_map.write() { + map.insert(Self::normalize_key(slug), path.clone()); + let normalized_title = Self::normalize_key(title.unwrap_or(slug)); + if !normalized_title.is_empty() { + map.insert(normalized_title, path); + } + } + } + + fn resolve_query_path(&self, query: &str) -> Option { + let trimmed = query.trim().trim_start_matches('@'); + if let Some((kind, raw_value)) = trimmed.split_once(':') { + let kind = kind.trim().to_ascii_lowercase(); + let value = raw_value.trim().trim_matches('/').replace(' ', "-"); + if !value.is_empty() && matches!(kind.as_str(), "sites" | "models") { + return Some(format!("{kind}/{value}")); + } + } + let normalized = Self::normalize_key(trimmed); + if normalized.is_empty() { + return None; + } + self.tag_map.read().ok()?.get(&normalized).cloned() + } + fn build_channel(&self, _clientversion: ClientVersion) -> Channel { Channel { id: "pornhat".to_string(), @@ -127,16 +181,8 @@ impl PornhatProvider { ) -> Result> { let search_string = query.to_lowercase().trim().replace(" ", "-"); let mut video_url = format!("{}/search/{}/{}/", self.url, search_string, page); - - if search_string.starts_with("@") { - let url_part = search_string - .split("@") - .collect::>() - .get(1) - .copied() - .unwrap_or_default() - .replace(":", "/"); - video_url = format!("{}/{}/", self.url, url_part); + if let Some(path) = self.resolve_query_path(query) { + video_url = format!("{}/{}/{}/", self.url, path, page); } // Check our Video Cache. If the result is younger than 1 hour, we return it. let old_items = match cache.get(&video_url) { @@ -296,7 +342,8 @@ impl PornhatProvider { .collect::>(); for tag in raw_tags { if !tag.is_empty() { - tags.push(format!("@sites:{}", tag)); + self.insert_tag_mapping("sites", &tag, None); + tags.push(Self::humanize_slug(&tag)); } } } @@ -316,7 +363,8 @@ impl PornhatProvider { .collect::>(); for tag in raw_tags { if !tag.is_empty() { - tags.push(format!("@models:{}", tag)); + self.insert_tag_mapping("models", &tag, None); + tags.push(Self::humanize_slug(&tag)); } } } diff --git a/src/providers/pornhub.rs b/src/providers/pornhub.rs index 8860a35..21313a9 100644 --- a/src/providers/pornhub.rs +++ b/src/providers/pornhub.rs @@ -1,23 +1,18 @@ use crate::DbPool; use crate::api::ClientVersion; use crate::providers::{ - Provider, report_provider_error, report_provider_error_background, requester_or_default, - build_proxy_url, strip_url_scheme, + Provider, build_proxy_url, report_provider_error, requester_or_default, strip_url_scheme, }; use crate::status::*; use crate::util::cache::VideoCache; use crate::util::parse_abbreviated_number; use crate::util::time::parse_time_to_seconds; -use crate::videos::{ServerOptions, VideoFormat, VideoItem}; +use crate::videos::{ServerOptions, VideoItem}; use async_trait::async_trait; -use chrono::{DateTime, NaiveDate, Utc}; use error_chain::error_chain; -use futures::stream::{self, StreamExt}; use htmlentity::entity::{ICodedDataTrait, decode}; -use regex::Regex; use scraper::{ElementRef, Html, Selector}; -use serde_json::Value; use std::collections::HashSet; use std::collections::HashMap; use std::sync::{Arc, RwLock}; @@ -32,7 +27,6 @@ pub const CHANNEL_METADATA: crate::providers::ProviderChannelMetadata = const BASE_URL: &str = "https://www.pornhub.com"; const CHANNEL_ID: &str = "pornhub"; -const DETAIL_ENRICH_LIMIT: usize = 12; error_chain! { foreign_links { @@ -216,11 +210,6 @@ impl PornhubProvider { .map_err(|error| Error::from(ErrorKind::Parse(format!("selector parse failed for {value}: {error}")))) } - fn regex(value: &str) -> Result { - Regex::new(value) - .map_err(|error| Error::from(ErrorKind::Parse(format!("regex parse failed for {value}: {error}")))) - } - fn text_of(element: &ElementRef<'_>) -> String { element .text() @@ -376,7 +365,12 @@ impl PornhubProvider { let title_selector = Self::selector(".title a, .thumbnailTitle, span.title a")?; let image_selector = Self::selector("img")?; let duration_selector = Self::selector(".duration")?; - let views_selector = Self::selector(".views var")?; + let views_selector = Self::selector(".views var, .views")?; + let rating_selector = + Self::selector(".value, .rating, .ratingInfo, .percent, .ratingPercent")?; + let tag_link_selector = Self::selector( + "a[href*=\"/categories/\"], a[href*=\"/video/search\"], a[href*=\"/pornstar/\"], a[href*=\"/model/\"], a[href*=\"/channels/\"], a[href*=\"/users/\"]", + )?; let uploader_selector = Self::selector( ".videoUploaderBlock a[href], .usernameWrap a[href], .usernameWrapper a[href]", )?; @@ -438,6 +432,8 @@ impl PornhubProvider { .value() .attr("src") .or_else(|| value.value().attr("data-mediumthumb")) + .or_else(|| value.value().attr("data-path")) + .or_else(|| value.value().attr("data-src")) }) .map(|value| self.normalize_url(value)) .unwrap_or_default(); @@ -449,10 +445,20 @@ impl PornhubProvider { .and_then(|value| parse_time_to_seconds(&value)) .unwrap_or(0) as u32; - let views = card - .select(&views_selector) - .next() - .and_then(|value| parse_abbreviated_number(&Self::text_of(&value))); + let views = card.select(&views_selector).find_map(|value| { + let text = Self::text_of(&value); + parse_abbreviated_number(&text) + .or_else(|| parse_abbreviated_number(text.replace("views", "").trim())) + }); + let rating = card.select(&rating_selector).find_map(|value| { + let text = Self::text_of(&value); + let cleaned = text + .trim() + .trim_end_matches('%') + .replace(',', "") + .replace(' ', ""); + cleaned.parse::().ok() + }); let uploader_link = card.select(&uploader_selector).next(); let uploader = uploader_link @@ -486,13 +492,18 @@ impl PornhubProvider { item.uploaderId = uploader_url .as_deref() .and_then(Self::uploader_identity_from_url); + item.rating = rating; let mut tags = Vec::new(); if let Some(tag) = uploader_url .as_deref() .and_then(|url| self.query_tag_from_uploader_url(url)) { - tags.push(tag); + Self::push_unique(&mut tags, tag); + } + for tag_link in card.select(&tag_link_selector) { + let tag = Self::decode_html(&Self::text_of(&tag_link)); + Self::push_unique(&mut tags, tag); } if !tags.is_empty() { item.tags = Some(tags); @@ -549,292 +560,6 @@ impl PornhubProvider { values.push(normalized.to_string()); } - fn collect_named_links(&self, document: &Html, selector_text: &str) -> Result> { - let selector = Self::selector(selector_text)?; - let mut values = Vec::new(); - for element in document.select(&selector) { - Self::push_unique(&mut values, Self::decode_html(&Self::text_of(&element))); - } - Ok(values) - } - - fn parse_upload_date(value: &str) -> Option { - if let Ok(parsed) = DateTime::parse_from_rfc3339(value.trim()) { - return Some(parsed.timestamp() as u64); - } - - NaiveDate::parse_from_str(value.trim(), "%Y-%m-%d") - .ok() - .and_then(|date| date.and_hms_opt(0, 0, 0)) - .map(|date| DateTime::::from_naive_utc_and_offset(date, Utc).timestamp() as u64) - } - - fn json_string(value: Option<&Value>) -> Option { - value.and_then(|value| match value { - Value::String(value) => Some(value.to_string()), - Value::Number(value) => Some(value.to_string()), - _ => None, - }) - } - - fn json_u32(value: Option<&Value>) -> Option { - match value { - Some(Value::Number(value)) => value.as_u64().and_then(|value| u32::try_from(value).ok()), - Some(Value::String(value)) => value.parse::().ok(), - _ => None, - } - } - - fn extract_flashvars(&self, html: &str) -> Result> { - let regex = Self::regex(r#"(?s)var\s+flashvars_\d+\s*=\s*(\{.*?\});"#)?; - let Some(raw) = regex - .captures(html) - .and_then(|captures| captures.get(1)) - .map(|value| value.as_str()) - else { - return Ok(None); - }; - - Ok(Some(serde_json::from_str::(raw)?)) - } - - fn extract_ld_video_object(&self, document: &Html) -> Result> { - let script_selector = Self::selector("script[type=\"application/ld+json\"]")?; - for script in document.select(&script_selector) { - let raw = script.inner_html(); - let Ok(value) = serde_json::from_str::(&raw) else { - continue; - }; - - if Self::is_video_object(&value) { - return Ok(Some(value)); - } - - if let Some(array) = value.as_array() { - for entry in array { - if Self::is_video_object(entry) { - return Ok(Some(entry.clone())); - } - } - } - } - - Ok(None) - } - - fn is_video_object(value: &Value) -> bool { - value - .get("@type") - .and_then(|value| value.as_str()) - .is_some_and(|value| value.eq_ignore_ascii_case("VideoObject")) - } - - fn build_formats_from_flashvars(&self, flashvars: &Value) -> Vec { - let mut entries = flashvars - .get("mediaDefinitions") - .and_then(|value| value.as_array()) - .into_iter() - .flatten() - .filter_map(|entry| { - let format = entry - .get("format") - .and_then(|value| value.as_str()) - .unwrap_or_default() - .to_ascii_lowercase(); - if format != "hls" { - return None; - } - - let url = entry - .get("videoUrl") - .and_then(|value| value.as_str()) - .map(|value| self.normalize_url(value)) - .filter(|value| !value.is_empty())?; - - let quality = entry - .get("quality") - .and_then(|value| value.as_str()) - .unwrap_or("auto"); - let label = match quality { - "auto" => "auto".to_string(), - value if value.ends_with('p') => value.to_string(), - value => format!("{value}p"), - }; - - let rank = if label == "auto" { - 0 - } else { - label - .trim_end_matches('p') - .parse::() - .unwrap_or(0) - }; - - Some((rank, label, url)) - }) - .collect::>(); - - entries.sort_by_key(|(rank, _, _)| *rank); - entries.dedup_by(|a, b| a.2 == b.2); - - entries - .into_iter() - .map(|(_, label, url)| { - VideoFormat::new(url, label.clone(), "m3u8".to_string()) - .format_id(label.clone()) - .format_note(label) - }) - .collect() - } - - fn apply_detail_video(&self, mut item: VideoItem, html: &str) -> Result { - let document = Html::parse_document(html); - - if let Some(flashvars) = self.extract_flashvars(html)? { - if let Some(title) = Self::json_string(flashvars.get("video_title")) { - let decoded = Self::decode_html(&title); - if !decoded.is_empty() { - item.title = decoded; - } - } - - if let Some(thumb) = Self::json_string(flashvars.get("image_url")) { - let normalized = self.normalize_url(&thumb); - if !normalized.is_empty() { - item.thumb = normalized; - } - } - - if let Some(duration) = Self::json_u32(flashvars.get("video_duration")) { - item.duration = duration; - } - - if let Some(link_url) = Self::json_string(flashvars.get("link_url")) { - let normalized = self.normalize_url(&link_url); - if !normalized.is_empty() { - item.url = normalized; - } - } - } - - if let Some(ld_video) = self.extract_ld_video_object(&document)? { - if let Some(thumb) = ld_video - .get("thumbnailUrl") - .and_then(|value| match value { - Value::String(value) => Some(value.to_string()), - Value::Array(values) => values - .iter() - .find_map(|entry| entry.as_str().map(ToOwned::to_owned)), - _ => None, - }) - { - let normalized = self.normalize_url(&thumb); - if !normalized.is_empty() { - item.thumb = normalized; - } - } - - if let Some(uploaded_at) = ld_video - .get("uploadDate") - .and_then(|value| value.as_str()) - .and_then(Self::parse_upload_date) - { - item.uploadedAt = Some(uploaded_at); - } - - if item.views.is_none() { - item.views = Self::json_string(ld_video.get("interactionCount")) - .and_then(|value| value.parse::().ok()); - } - - if item.uploader.is_none() { - item.uploader = ld_video - .get("author") - .and_then(|value| match value { - Value::String(value) => Some(value.to_string()), - Value::Object(values) => values - .get("name") - .and_then(|value| value.as_str()) - .map(ToOwned::to_owned), - _ => None, - }) - .filter(|value| !value.trim().is_empty()); - } - } - - let mut tags = item.tags.clone().unwrap_or_default(); - for value in self.collect_named_links( - &document, - ".categoriesWrapper a.item, .categoriesWrapper a[href*=\"/categories/\"]", - )? { - Self::push_unique(&mut tags, value); - } - for value in self.collect_named_links( - &document, - ".tagsWrapper a.item, .tagsWrapper a[href*=\"/video/search\"]", - )? { - Self::push_unique(&mut tags, value); - } - for value in self.collect_named_links( - &document, - ".pornstarsWrapper a.item, .pornstarsWrapper a[href*=\"/pornstar/\"], a[href*=\"/pornstar/\"]", - )? { - Self::push_unique(&mut tags, value); - } - for value in self.collect_named_links( - &document, - ".modelsWrapper a.item, .modelsWrapper a[href*=\"/model/\"], a[href*=\"/model/\"]", - )? { - Self::push_unique(&mut tags, value); - } - if !tags.is_empty() { - item.tags = Some(tags); - } - - Ok(item) - } - - async fn enrich_listing_items(&self, items: Vec, options: &ServerOptions) -> Vec { - let requester = requester_or_default(options, CHANNEL_ID, "enrich_listing_items.requester"); - let mut enriched = stream::iter(items.into_iter().enumerate().map(|(index, item)| { - let provider = self.clone(); - let requester = requester.clone(); - async move { - if index >= DETAIL_ENRICH_LIMIT || item.url.is_empty() { - return (index, item); - } - - let fallback = item.clone(); - let enriched = match provider.fetch_detail(item, requester).await { - Ok(value) => value, - Err(error) => { - report_provider_error_background( - CHANNEL_ID, - "enrich_listing_items.detail", - &format!("url={}; error={error}", fallback.url), - ); - fallback - } - }; - (index, enriched) - } - })) - .buffer_unordered(4) - .collect::>() - .await; - - enriched.sort_by_key(|(index, _)| *index); - enriched.into_iter().map(|(_, item)| item).collect() - } - - async fn fetch_detail(&self, item: VideoItem, mut requester: crate::util::requester::Requester) -> Result { - let html = requester - .get(&item.url, None) - .await - .map_err(|error| ErrorKind::Parse(format!("detail request failed: {error}")))?; - self.apply_detail_video(item, &html) - } - async fn fetch_listing( &self, cache: VideoCache, @@ -843,6 +568,10 @@ impl PornhubProvider { query: Option<&str>, options: ServerOptions, ) -> Result> { + if query.is_some() && self.tag_map.read().unwrap().is_empty() { + let _ = Self::load_tags(&self.url, Arc::clone(&self.tag_map)).await; + } + let (video_url, scope) = self.build_listing_request(page, sort, query); let old_items = match cache.get(&video_url) { Some((time, items)) if time.elapsed().unwrap_or_default().as_secs() < 60 * 5 => { @@ -883,20 +612,14 @@ impl PornhubProvider { return Ok(old_items); } - let mut items = self.enrich_listing_items(items, &options).await; + let mut items = items; - // Rewrite thumbs and previews to use the proxy when appropriate + // Rewrite thumbs to use the page-driven thumb proxy when appropriate for item in items.iter_mut() { - let proxied = self.proxied_thumb(&options, &item.thumb); + let proxied = self.proxied_thumb(&options, &item.url); if !proxied.is_empty() { item.thumb = proxied; } - if let Some(prev) = item.preview.clone() { - let proxied_prev = self.proxied_thumb(&options, &prev); - if !proxied_prev.is_empty() { - item.preview = Some(proxied_prev); - } - } } cache.remove(&video_url); @@ -906,28 +629,33 @@ impl PornhubProvider { } impl PornhubProvider { - fn proxied_thumb(&self, options: &ServerOptions, thumb: &str) -> String { - if thumb.is_empty() { + fn proxied_thumb(&self, options: &ServerOptions, page_url: &str) -> String { + if page_url.is_empty() { return String::new(); } - if !PornhubThumbPolicy::is_allowed_thumb_url(thumb) { + if !PornhubThumbPolicy::is_allowed_video_page_url(page_url) { return String::new(); } - build_proxy_url(options, "pornhub-thumb", &strip_url_scheme(thumb)) + build_proxy_url(options, "pornhub-thumb", &strip_url_scheme(page_url)) } } struct PornhubThumbPolicy; impl PornhubThumbPolicy { - fn is_allowed_thumb_url(url: &str) -> bool { + fn is_allowed_video_page_url(url: &str) -> bool { let Some(url) = Url::parse(url).ok() else { return false; }; if url.scheme() != "https" { return false; } let Some(host) = url.host_str() else { return false; }; - // Only allow the specific Pornhub CDN host used for thumbnails - host.eq_ignore_ascii_case("pix-cdn77.phncdn.com") + if !host.eq_ignore_ascii_case("pornhub.com") + && !host.eq_ignore_ascii_case("www.pornhub.com") + && !host.ends_with(".pornhub.com") + { + return false; + } + url.path().starts_with("/view_video.php") || url.path().starts_with("/video/") } } @@ -986,6 +714,33 @@ mod tests { assert!(provider.parse_query_target("teacher").is_none()); } + #[test] + fn resolves_query_from_tag_map_by_id_or_title() { + let provider = PornhubProvider::new(); + { + let mut map = provider.tag_map.write().unwrap(); + let info = TagInfo { + kind: QueryTargetKind::Channel, + slug: "mature-4k".to_string(), + title: "Mature 4K".to_string(), + }; + map.insert("mature-4k".to_string(), info.clone()); + map.insert("mature 4k".to_string(), info); + } + + let by_id = provider + .parse_query_target("mature-4k") + .expect("id lookup should resolve"); + assert!(matches!(by_id.kind, QueryTargetKind::Channel)); + assert_eq!(by_id.slug, "mature-4k"); + + let by_title = provider + .parse_query_target("Mature 4K") + .expect("title lookup should resolve"); + assert!(matches!(by_title.kind, QueryTargetKind::Channel)); + assert_eq!(by_title.slug, "mature-4k"); + } + #[test] fn parses_browse_listing_cards() { let provider = PornhubProvider::new(); @@ -1030,66 +785,47 @@ mod tests { assert!(items[0] .tags .as_ref() - .is_some_and(|values| values.iter().any(|value| value == "@model:honeycore"))); + .is_some_and(|values| values.iter().any(|value| value.eq_ignore_ascii_case("honeycore")))); } #[test] - fn applies_detail_video_metadata() { + fn parses_listing_metadata_without_detail_fetch() { let provider = PornhubProvider::new(); - let item = VideoItem::new( - "69cfa159b1377".to_string(), - "placeholder".to_string(), - "https://www.pornhub.com/view_video.php?viewkey=69cfa159b1377".to_string(), - CHANNEL_ID.to_string(), - "https://example.com/thumb.jpg".to_string(), - 0, - ); let html = r#" - - - -
- Maid -
+
    +
  • + + +
    12:18
    +
    + 199K views + 95% +
    + Anal + Jane Doe +
  • +
"#; - let item = provider - .apply_detail_video(item, html) - .expect("detail page should enrich item"); - assert_eq!(item.title, "Brazzers Detail Title"); - assert_eq!(item.thumb, "https://example.com/ld-thumb.jpg"); - assert_eq!(item.duration, 930); - assert_eq!(item.views, Some(5700)); - assert_eq!(item.uploader.as_deref(), Some("Brazzers")); - assert!(item.uploadedAt.is_some()); - assert_eq!(item.formats.as_ref().map(|values| values.len()), Some(2)); - assert!(item.tags.as_ref().is_some_and(|values| values - .iter() - .any(|value| value == "Big Tits"))); - assert!(item.tags.as_ref().is_some_and(|values| values - .iter() - .any(|value| value == "Maid"))); + let items = provider + .parse_listing_page(html, ListingScope::Browse) + .expect("browse listing should parse"); + + assert_eq!(items.len(), 1); + assert_eq!(items[0].thumb, "https://example.com/thumb.jpg"); + assert_eq!(items[0].preview.as_deref(), Some("https://example.com/preview.webm")); + assert_eq!(items[0].views, Some(199000)); + assert_eq!(items[0].rating, Some(95.0)); + assert!(items[0] + .tags + .as_ref() + .is_some_and(|values| values.iter().any(|value| value == "Anal"))); + assert!(items[0] + .tags + .as_ref() + .is_some_and(|values| values.iter().any(|value| value == "Jane Doe"))); } } diff --git a/src/proxies/mod.rs b/src/proxies/mod.rs index 332fcb8..a63edfe 100644 --- a/src/proxies/mod.rs +++ b/src/proxies/mod.rs @@ -10,7 +10,6 @@ use crate::{proxies::sxyprn::SxyprnProxy, util::requester::Requester}; pub mod doodstream; pub mod hanimecdn; pub mod hqpornerthumb; -pub mod pornhubthumb; pub mod javtiful; pub mod noodlemagazine; pub mod pimpbunny; @@ -18,6 +17,7 @@ pub mod pimpbunnythumb; pub mod porndish; pub mod porndishthumb; pub mod pornhd3x; +pub mod pornhubthumb; pub mod shooshtime; pub mod spankbang; pub mod sxyprn; diff --git a/src/proxies/pornhubthumb.rs b/src/proxies/pornhubthumb.rs index f30dbe7..5e5b583 100644 --- a/src/proxies/pornhubthumb.rs +++ b/src/proxies/pornhubthumb.rs @@ -1,51 +1,220 @@ -use ntex::http::header::{CONTENT_LENGTH, CONTENT_TYPE}; -use ntex::{ - http::Response, - web::{self, HttpRequest, error}, -}; +use ntex::web::{self, HttpRequest}; +use regex::Regex; +use scraper::{Html, Selector}; +use url::Url; use crate::util::requester::Requester; +const PORNHUB_ROOT: &str = "https://www.pornhub.com/"; + +fn endpoint_to_page_url(req: &HttpRequest) -> String { + let endpoint = req.match_info().query("endpoint").trim_start_matches('/'); + let mut page_url = if endpoint.starts_with("http://") || endpoint.starts_with("https://") { + endpoint.to_string() + } else { + format!("https://{endpoint}") + }; + + let query = req.query_string(); + if !query.is_empty() && !page_url.contains('?') { + page_url.push('?'); + page_url.push_str(query); + } + + page_url +} + +fn is_allowed_video_page_url(url: &str) -> bool { + let Some(url) = Url::parse(url).ok() else { + return false; + }; + if url.scheme() != "https" { + return false; + } + let Some(host) = url.host_str() else { + return false; + }; + if host != "pornhub.com" && host != "www.pornhub.com" && !host.ends_with(".pornhub.com") { + return false; + } + url.path().starts_with("/view_video.php") || url.path().starts_with("/video/") +} + +fn normalize_candidate_url(candidate: &str, page_url: &Url) -> Option { + if candidate.is_empty() { + return None; + } + if candidate.starts_with("//") { + return Some(format!("https:{candidate}")); + } + if candidate.starts_with("https://") || candidate.starts_with("http://") { + return Some(candidate.to_string()); + } + if candidate.starts_with('/') { + let host = page_url.host_str()?; + return Some(format!("{}://{}{}", page_url.scheme(), host, candidate)); + } + None +} + +fn is_allowed_thumb_url(url: &str) -> bool { + let Some(url) = Url::parse(url).ok() else { + return false; + }; + if url.scheme() != "https" { + return false; + } + let Some(host) = url.host_str() else { + return false; + }; + let allowed_host = host == "pornhub.com" + || host == "www.pornhub.com" + || host.ends_with(".pornhub.com") + || host.ends_with(".phncdn.com"); + if !allowed_host { + return false; + } + let path = url.path().to_ascii_lowercase(); + [".jpg", ".jpeg", ".png", ".webp", ".avif"] + .iter() + .any(|ext| path.ends_with(ext)) +} + +fn decode_js_string(value: &str) -> String { + value + .replace("\\/", "/") + .replace("\\u002F", "/") + .replace("\\u003A", ":") +} + +fn find_thumb_in_html(html: &str, page_url: &Url) -> Option { + let document = Html::parse_document(html); + let selector = Selector::parse( + "meta[property=\"og:image\"], meta[name=\"twitter:image\"], meta[itemprop=\"thumbnailUrl\"]", + ) + .ok()?; + + for meta in document.select(&selector) { + let value = meta.value().attr("content").unwrap_or_default().trim(); + if let Some(candidate) = normalize_candidate_url(value, page_url) { + if is_allowed_thumb_url(&candidate) { + return Some(candidate); + } + } + } + + let image_url_re = Regex::new(r#""image_url"\s*:\s*"([^"]+)""#).ok()?; + if let Some(captures) = image_url_re.captures(html) { + let raw = captures + .get(1) + .map(|value| value.as_str()) + .unwrap_or_default(); + let decoded = decode_js_string(raw); + if let Some(candidate) = normalize_candidate_url(&decoded, page_url) { + if is_allowed_thumb_url(&candidate) { + return Some(candidate); + } + } + } + + None +} + pub async fn get_image( req: HttpRequest, requester: web::types::State, ) -> Result { - let endpoint = req.match_info().query("endpoint").to_string(); - let image_url = if endpoint.starts_with("http://") || endpoint.starts_with("https://") { - endpoint - } else { - format!("https://{}", endpoint.trim_start_matches('/')) - }; + let page_url = endpoint_to_page_url(&req); + if !is_allowed_video_page_url(&page_url) { + return Ok(web::HttpResponse::BadRequest().finish()); + } - let upstream = match requester - .get_ref() - .clone() - .get_raw_with_headers( - image_url.as_str(), - vec![("Referer".to_string(), "https://www.pornhub.com/".to_string())], + let mut requester = requester.get_ref().clone(); + let html = match requester + .get_with_headers( + page_url.as_str(), + vec![("Referer".to_string(), PORNHUB_ROOT.to_string())], + None, ) .await { - Ok(response) => response, + Ok(value) => value, Err(_) => return Ok(web::HttpResponse::NotFound().finish()), }; - let status = upstream.status(); - let headers = upstream.headers().clone(); - let bytes = upstream.bytes().await.map_err(error::ErrorBadGateway)?; + let parsed_page_url = match Url::parse(&page_url) { + Ok(value) => value, + Err(_) => return Ok(web::HttpResponse::BadRequest().finish()), + }; - let mut resp = Response::build(status); + let Some(image_url) = find_thumb_in_html(&html, &parsed_page_url) else { + return Ok(web::HttpResponse::NotFound().finish()); + }; - if let Some(ct) = headers.get(CONTENT_TYPE) { - if let Ok(ct_str) = ct.to_str() { - resp.set_header(CONTENT_TYPE, ct_str); - } - } - if let Some(cl) = headers.get(CONTENT_LENGTH) { - if let Ok(cl_str) = cl.to_str() { - resp.set_header(CONTENT_LENGTH, cl_str); - } - } - - Ok(resp.body(bytes.to_vec())) + Ok(web::HttpResponse::Found() + .header("Location", image_url) + .finish()) +} + +#[cfg(test)] +mod tests { + use super::{ + decode_js_string, is_allowed_thumb_url, is_allowed_video_page_url, normalize_candidate_url, + }; + use url::Url; + + #[test] + fn validates_allowed_video_pages() { + assert!(is_allowed_video_page_url( + "https://www.pornhub.com/view_video.php?viewkey=abc123" + )); + assert!(is_allowed_video_page_url( + "https://www.pornhub.com/video/search?search=test" + )); + assert!(!is_allowed_video_page_url( + "https://example.com/view_video.php?viewkey=abc123" + )); + assert!(!is_allowed_video_page_url( + "http://www.pornhub.com/view_video.php?viewkey=abc123" + )); + } + + #[test] + fn validates_allowed_thumb_hosts_and_extensions() { + assert!(is_allowed_thumb_url( + "https://pix-cdn77.phncdn.com/videos/2026/04/01/1/(m=eafTGgaaaa)(mh=abc123)1.jpg" + )); + assert!(is_allowed_thumb_url( + "https://www.pornhub.com/webmasters/thumb.webp" + )); + assert!(!is_allowed_thumb_url("https://example.com/thumb.jpg")); + assert!(!is_allowed_thumb_url( + "https://pix-cdn77.phncdn.com/videos/2026/04/01/1/manifest.m3u8" + )); + } + + #[test] + fn normalizes_protocol_relative_and_root_relative_urls() { + let page_url = Url::parse("https://www.pornhub.com/view_video.php?viewkey=abc").unwrap(); + let protocol_relative = + normalize_candidate_url("//pix-cdn77.phncdn.com/thumb.jpg", &page_url); + assert_eq!( + protocol_relative.as_deref(), + Some("https://pix-cdn77.phncdn.com/thumb.jpg") + ); + + let root_relative = normalize_candidate_url("/assets/thumb.jpg", &page_url); + assert_eq!( + root_relative.as_deref(), + Some("https://www.pornhub.com/assets/thumb.jpg") + ); + } + + #[test] + fn decodes_js_escaped_urls() { + assert_eq!( + decode_js_string(r#"https:\/\/pix-cdn77.phncdn.com\/thumb.jpg"#), + "https://pix-cdn77.phncdn.com/thumb.jpg" + ); + } }