From fbe04fc752af38430c210188b1588a5426aa775d Mon Sep 17 00:00:00 2001 From: Simon Date: Sun, 22 Mar 2026 15:56:25 +0000 Subject: [PATCH] upgrades --- .gitignore | 3 +- Cargo.toml | 1 + build.rs | 24 +- src/main.rs | 5 +- src/providers/arabpornxxx.rs | 1171 +++++++++++++++++++++++++++++ src/providers/pornhd3x.rs | 1298 +++++++++++++++++++++++++++++++++ src/providers/pornmz.rs | 1241 +++++++++++++++++++++++++++++++ src/providers/sextb.rs | 1210 ++++++++++++++++++++++++++++++ src/proxies/noodlemagazine.rs | 9 +- src/proxies/pimpbunny.rs | 11 +- src/proxies/pimpbunnythumb.rs | 7 +- src/util/requester.rs | 6 +- 12 files changed, 4960 insertions(+), 26 deletions(-) create mode 100644 src/providers/arabpornxxx.rs create mode 100644 src/providers/pornhd3x.rs create mode 100644 src/providers/pornmz.rs create mode 100644 src/providers/sextb.rs diff --git a/.gitignore b/.gitignore index 0056e0c..98cf6c4 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,7 @@ # will have compiled files and executables debug/ target/ -.testing/ +.*/ # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html @@ -17,4 +17,3 @@ Cargo.lock *.db migrations/.keep -.vscode diff --git a/Cargo.toml b/Cargo.toml index 7e035b7..4ebf0a0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,6 +36,7 @@ dashmap = "6.1.0" lru = "0.16.3" rand = "0.10.0" chrono = "0.4.44" +md5 = "0.8.0" [lints.rust] unexpected_cfgs = "allow" diff --git a/build.rs b/build.rs index 6fa6e80..692239b 100644 --- a/build.rs +++ b/build.rs @@ -29,6 +29,11 @@ const PROVIDERS: &[ProviderDef] = &[ module: "pornhub", ty: "PornhubProvider", }, + ProviderDef { + id: "pornhd3x", + module: "pornhd3x", + ty: "Pornhd3xProvider", + }, ProviderDef { id: "spankbang", module: "spankbang", @@ -84,6 +89,11 @@ const PROVIDERS: &[ProviderDef] = &[ module: "yesporn", ty: "YespornProvider", }, + ProviderDef { + id: "arabpornxxx", + module: 
"arabpornxxx", + ty: "ArabpornxxxProvider", + }, ProviderDef { id: "sxyprn", module: "sxyprn", @@ -109,6 +119,11 @@ const PROVIDERS: &[ProviderDef] = &[ module: "porn4fans", ty: "Porn4fansProvider", }, + ProviderDef { + id: "pornmz", + module: "pornmz", + ty: "PornmzProvider", + }, ProviderDef { id: "porndish", module: "porndish", @@ -214,6 +229,11 @@ const PROVIDERS: &[ProviderDef] = &[ module: "hsex", ty: "HsexProvider", }, + ProviderDef { + id: "sextb", + module: "sextb", + ty: "SextbProvider", + }, ProviderDef { id: "hentaihaven", module: "hentaihaven", @@ -236,9 +256,7 @@ fn main() { .map(|provider| format!("\"{}\"", provider.id)) .collect::>() .join(", "); - println!( - "cargo:rustc-check-cfg=cfg(hottub_provider, values({provider_cfg_values}))" - ); + println!("cargo:rustc-check-cfg=cfg(hottub_provider, values({provider_cfg_values}))"); let selected = env::var("HOT_TUB_PROVIDER") .or_else(|_| env::var("HOTTUB_PROVIDER")) diff --git a/src/main.rs b/src/main.rs index b809944..7392e8b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -59,10 +59,7 @@ async fn main() -> std::io::Result<()> { let mut requester = util::requester::Requester::new(); let proxy_enabled = env::var("PROXY").unwrap_or("0".to_string()) != "0".to_string(); requester.set_proxy(proxy_enabled); - crate::flow_debug!( - "requester initialized proxy_enabled={}", - proxy_enabled - ); + crate::flow_debug!("requester initialized proxy_enabled={}", proxy_enabled); let cache: util::cache::VideoCache = crate::util::cache::VideoCache::new() .max_size(100_000) diff --git a/src/providers/arabpornxxx.rs b/src/providers/arabpornxxx.rs new file mode 100644 index 0000000..a380205 --- /dev/null +++ b/src/providers/arabpornxxx.rs @@ -0,0 +1,1171 @@ +use crate::DbPool; +use crate::api::ClientVersion; +use crate::providers::{ + Provider, report_provider_error, report_provider_error_background, requester_or_default, +}; +use crate::status::*; +use crate::util::cache::VideoCache; +use 
crate::util::parse_abbreviated_number; +use crate::util::requester::Requester; +use crate::util::time::parse_time_to_seconds; +use crate::videos::{ServerOptions, VideoItem}; +use async_trait::async_trait; +use chrono::{DateTime, NaiveDate, Utc}; +use error_chain::error_chain; +use futures::stream::{self, StreamExt}; +use htmlentity::entity::{ICodedDataTrait, decode}; +use regex::Regex; +use scraper::{ElementRef, Html, Selector}; +use std::sync::{Arc, RwLock}; +use std::{thread, vec}; + +pub const CHANNEL_METADATA: crate::providers::ProviderChannelMetadata = + crate::providers::ProviderChannelMetadata { + group_id: "mainstream-tube", + tags: &["arab", "hijab", "niche"], + }; + +error_chain! { + foreign_links { + Io(std::io::Error); + } + errors { + Parse(msg: String) { + description("parse error") + display("parse error: {}", msg) + } + } +} + +const BASE_URL: &str = "https://arabporn.xxx"; +const CHANNEL_ID: &str = "arabpornxxx"; + +#[derive(Debug, Clone)] +pub struct ArabpornxxxProvider { + url: String, + categories: Arc>>, + tags: Arc>>, + uploaders: Arc>>, +} + +#[derive(Debug, Clone)] +enum Target { + Latest, + Archive(String), + Search(String), +} + +impl ArabpornxxxProvider { + pub fn new() -> Self { + let provider = Self { + url: BASE_URL.to_string(), + categories: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + tags: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + uploaders: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + }; + provider.spawn_initial_load(); + provider + } + + fn spawn_initial_load(&self) { + let url = self.url.clone(); + let categories = Arc::clone(&self.categories); + let tags = Arc::clone(&self.tags); + let uploaders = Arc::clone(&self.uploaders); + + thread::spawn(move || { + let runtime = match tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + { + 
Ok(runtime) => runtime, + Err(error) => { + report_provider_error_background( + CHANNEL_ID, + "spawn_initial_load.runtime_build", + &error.to_string(), + ); + return; + } + }; + + runtime.block_on(async move { + if let Err(error) = + Self::load_categories(&url, Arc::clone(&categories), Arc::clone(&uploaders)) + .await + { + report_provider_error_background( + CHANNEL_ID, + "load_categories", + &error.to_string(), + ); + } + if let Err(error) = Self::load_tags(&url, Arc::clone(&tags)).await { + report_provider_error_background(CHANNEL_ID, "load_tags", &error.to_string()); + } + }); + }); + } + + fn build_channel(&self, _clientversion: ClientVersion) -> Channel { + let categories = self + .categories + .read() + .map(|value| value.clone()) + .unwrap_or_default(); + let tags = self.tags.read().map(|value| value.clone()).unwrap_or_default(); + let uploaders = self + .uploaders + .read() + .map(|value| value.clone()) + .unwrap_or_default(); + + Channel { + id: CHANNEL_ID.to_string(), + name: "ArabPorn.xxx".to_string(), + description: + "ArabPorn.xxx videos with latest, popular, rated, category, tag, and source archives." 
+ .to_string(), + premium: false, + favicon: "https://www.google.com/s2/favicons?sz=64&domain=arabporn.xxx".to_string(), + status: "active".to_string(), + categories: categories.iter().map(|value| value.title.clone()).collect(), + options: vec![ + ChannelOption { + id: "sort".to_string(), + title: "Sort".to_string(), + description: "Browse ArabPorn.xxx by archive order.".to_string(), + systemImage: "list.number".to_string(), + colorName: "blue".to_string(), + options: vec![ + FilterOption { + id: "new".to_string(), + title: "Latest".to_string(), + }, + FilterOption { + id: "popular".to_string(), + title: "Most Viewed".to_string(), + }, + FilterOption { + id: "rated".to_string(), + title: "Top Rated".to_string(), + }, + FilterOption { + id: "longest".to_string(), + title: "Longest".to_string(), + }, + FilterOption { + id: "commented".to_string(), + title: "Most Commented".to_string(), + }, + FilterOption { + id: "recommended".to_string(), + title: "Most Favorited".to_string(), + }, + ], + multiSelect: false, + }, + ChannelOption { + id: "categories".to_string(), + title: "Categories".to_string(), + description: "Browse an ArabPorn.xxx category archive.".to_string(), + systemImage: "square.grid.2x2".to_string(), + colorName: "orange".to_string(), + options: categories, + multiSelect: false, + }, + ChannelOption { + id: "filter".to_string(), + title: "Tags".to_string(), + description: "Browse an ArabPorn.xxx tag archive.".to_string(), + systemImage: "tag.fill".to_string(), + colorName: "green".to_string(), + options: tags, + multiSelect: false, + }, + ChannelOption { + id: "sites".to_string(), + title: "Uploaders".to_string(), + description: "Browse an ArabPorn.xxx source or uploader archive.".to_string(), + systemImage: "person.crop.square".to_string(), + colorName: "purple".to_string(), + options: uploaders, + multiSelect: false, + }, + ], + nsfw: true, + cacheDuration: Some(1800), + } + } + + fn selector(value: &str) -> Result { + Selector::parse(value) + 
.map_err(|error| Error::from(format!("selector `{value}` parse failed: {error}"))) + } + + fn regex(value: &str) -> Result { + Regex::new(value).map_err(|error| Error::from(format!("regex `{value}` failed: {error}"))) + } + + fn text_of(element: &ElementRef<'_>) -> String { + element + .text() + .collect::>() + .join(" ") + .split_whitespace() + .collect::>() + .join(" ") + .trim() + .to_string() + } + + fn decode_html(text: &str) -> String { + decode(text.as_bytes()) + .to_string() + .unwrap_or_else(|_| text.to_string()) + } + + fn clean_title(value: &str) -> String { + Self::decode_html(value) + .split_whitespace() + .collect::>() + .join(" ") + .trim() + .to_string() + } + + fn normalize_title(value: &str) -> String { + value + .to_ascii_lowercase() + .chars() + .map(|value| { + if value.is_ascii_alphanumeric() { + value + } else { + ' ' + } + }) + .collect::() + .split_whitespace() + .collect::>() + .join(" ") + } + + fn normalize_url(&self, value: &str) -> String { + let value = value.trim(); + if value.is_empty() { + return String::new(); + } + if value.starts_with("http://") || value.starts_with("https://") { + return value.to_string(); + } + if value.starts_with("//") { + return format!("https:{value}"); + } + if value.starts_with('/') { + return format!("{}{}", self.url, value); + } + format!("{}/{}", self.url, value.trim_start_matches("./")) + } + + fn parse_duration(text: &str) -> u32 { + parse_time_to_seconds(text) + .and_then(|value| u32::try_from(value).ok()) + .unwrap_or(0) + } + + fn parse_views(text: &str) -> Option { + let cleaned = text + .replace("views", "") + .replace("view", "") + .replace(' ', "") + .trim() + .to_string(); + parse_abbreviated_number(&cleaned) + } + + fn parse_percent(text: &str) -> Option { + text.trim() + .trim_end_matches('%') + .trim() + .parse::() + .ok() + } + + fn parse_upload_date_timestamp(html: &str) -> Option { + let regex = Self::regex(r#""uploadDate"\s*:\s*"([^"]+)""#).ok()?; + let value = 
regex.captures(html)?.get(1)?.as_str(); + DateTime::parse_from_rfc3339(value) + .map(|value| value.with_timezone(&Utc).timestamp() as u64) + .ok() + .or_else(|| { + NaiveDate::parse_from_str(value, "%Y-%m-%d") + .ok() + .and_then(|value| value.and_hms_opt(0, 0, 0)) + .map(|value| value.and_utc().timestamp() as u64) + }) + } + + fn parse_iso8601_duration_seconds(value: &str) -> Option { + let regex = Self::regex(r#"^PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?$"#).ok()?; + let captures = regex.captures(value)?; + let hours = captures + .get(1) + .and_then(|value| value.as_str().parse::().ok()) + .unwrap_or(0); + let minutes = captures + .get(2) + .and_then(|value| value.as_str().parse::().ok()) + .unwrap_or(0); + let seconds = captures + .get(3) + .and_then(|value| value.as_str().parse::().ok()) + .unwrap_or(0); + Some( + hours + .saturating_mul(3600) + .saturating_add(minutes.saturating_mul(60)) + .saturating_add(seconds), + ) + } + + fn extract_json_string(html: &str, key: &str) -> Option { + let regex = Self::regex(&format!(r#""{key}"\s*:\s*"([^"]+)""#)).ok()?; + regex + .captures(html) + .and_then(|value| value.get(1)) + .map(|value| value.as_str().to_string()) + } + + fn extract_js_value(html: &str, key: &str) -> Option { + let regex = Self::regex(&format!(r#"{key}\s*:\s*'((?:\\'|[^'])*)'"#)).ok()?; + regex + .captures(html) + .and_then(|value| value.get(1)) + .map(|value| value.as_str().replace("\\'", "'")) + } + + fn encode_search_query(query: &str) -> String { + let mut serializer = url::form_urlencoded::Serializer::new(String::new()); + serializer.append_pair("q", query); + serializer + .finish() + .strip_prefix("q=") + .unwrap_or_default() + .to_string() + } + + fn build_search_path_query(query: &str) -> String { + Self::normalize_title(query).replace(' ', "-") + } + + fn sort_param(sort: &str) -> &'static str { + match sort { + "popular" | "viewed" | "trending" => "video_viewed", + "rated" | "rating" | "top" => "rating", + "longest" | "duration" => "duration", + 
"commented" | "comments" => "most_commented", + "recommended" | "favorited" | "favourited" => "most_favourited", + _ => "post_date", + } + } + + fn default_archive_url_for_sort(&self, sort: &str) -> String { + match sort { + "rated" | "rating" | "top" => format!("{}/top-rated/", self.url), + "new" | "latest" | "date" | "recent" | "" => format!("{}/latest-updates/", self.url), + _ => format!("{}/most-popular/", self.url), + } + } + + fn build_latest_url(&self, page: u16) -> String { + format!( + "{}/latest-updates/?mode=async&function=get_block&block_id=list_videos_latest_videos_list&sort_by=post_date&from={page}", + self.url + ) + } + + fn build_archive_url(&self, archive_url: &str, page: u16, sort: &str) -> String { + let block_id = if archive_url.contains("/latest-updates/") { + "list_videos_latest_videos_list" + } else { + "list_videos_common_videos_list" + }; + let page_key = if block_id == "list_videos_latest_videos_list" { + "from" + } else { + "from" + }; + format!( + "{archive_url}?mode=async&function=get_block&block_id={block_id}&sort_by={}&{page_key}={page}", + Self::sort_param(sort) + ) + } + + fn build_search_url(&self, query: &str, page: u16, sort: &str) -> String { + let path_query = Self::build_search_path_query(query); + let encoded_query = Self::encode_search_query(query); + format!( + "{}/search/{path_query}/?mode=async&function=get_block&block_id=list_videos_videos_list_search_result&q={encoded_query}&category_ids=&sort_by={}&from_videos={page}&from_albums={page}", + self.url, + Self::sort_param(sort) + ) + } + + fn match_filter(options: &[FilterOption], query: &str) -> Option { + let normalized_query = Self::normalize_title(query); + options + .iter() + .find(|value| { + value.id != "all" && Self::normalize_title(&value.title) == normalized_query + }) + .map(|value| value.id.clone()) + } + + fn push_unique(target: &Arc>>, item: FilterOption) { + if item.id.is_empty() || item.title.is_empty() { + return; + } + if let Ok(mut values) = 
target.write() { + if !values.iter().any(|value| value.id == item.id) { + values.push(item); + } + } + } + + async fn fetch_html(requester: &mut Requester, url: &str) -> Result { + requester + .get(url, None) + .await + .map_err(|error| Error::from(format!("request failed for {url}: {error}"))) + } + + async fn load_categories( + base_url: &str, + categories: Arc>>, + uploaders: Arc>>, + ) -> Result<()> { + let mut requester = Requester::new(); + let html = Self::fetch_html(&mut requester, &format!("{base_url}/categories/")).await?; + let document = Html::parse_document(&html); + let selector = Self::selector("#list_categories_categories_list_items a.item[href]")?; + + for element in document.select(&selector) { + let href = element.value().attr("href").unwrap_or_default().trim_end_matches('/'); + if !href.starts_with(&format!("{base_url}/categories/")) { + continue; + } + let remainder = href + .strip_prefix(&format!("{base_url}/categories/")) + .unwrap_or_default(); + if remainder.is_empty() || remainder.contains('/') { + continue; + } + + let title = element + .value() + .attr("title") + .map(Self::clean_title) + .filter(|value| !value.is_empty()) + .unwrap_or_else(|| Self::text_of(&element)); + if title.is_empty() { + continue; + } + + let option = FilterOption { + id: format!("{href}/"), + title, + }; + Self::push_unique(&categories, option.clone()); + Self::push_unique(&uploaders, option); + } + + Ok(()) + } + + async fn load_tags(base_url: &str, tags: Arc>>) -> Result<()> { + let mut requester = Requester::new(); + let html = Self::fetch_html(&mut requester, &format!("{base_url}/tags/")).await?; + let document = Html::parse_document(&html); + let selector = Self::selector("a[href]")?; + + for element in document.select(&selector) { + let href = element.value().attr("href").unwrap_or_default().trim_end_matches('/'); + if !href.starts_with(&format!("{base_url}/tags/")) { + continue; + } + let remainder = href + .strip_prefix(&format!("{base_url}/tags/")) + 
.unwrap_or_default(); + if remainder.is_empty() || remainder.contains('/') { + continue; + } + + let title = element + .value() + .attr("title") + .map(Self::clean_title) + .filter(|value| !value.is_empty()) + .unwrap_or_else(|| Self::text_of(&element)); + if title.is_empty() { + continue; + } + + Self::push_unique( + &tags, + FilterOption { + id: format!("{href}/"), + title, + }, + ); + } + + Ok(()) + } + + fn filters_need_refresh(&self) -> bool { + let categories_len = self + .categories + .read() + .map(|values| values.len()) + .unwrap_or_default(); + let tags_len = self.tags.read().map(|values| values.len()).unwrap_or_default(); + let uploaders_len = self + .uploaders + .read() + .map(|values| values.len()) + .unwrap_or_default(); + categories_len <= 1 || tags_len <= 1 || uploaders_len <= 1 + } + + async fn refresh_filter_catalogs(&self) { + if let Err(error) = Self::load_categories( + &self.url, + Arc::clone(&self.categories), + Arc::clone(&self.uploaders), + ) + .await + { + report_provider_error_background( + CHANNEL_ID, + "refresh_filter_catalogs.categories", + &error.to_string(), + ); + } + if let Err(error) = Self::load_tags(&self.url, Arc::clone(&self.tags)).await { + report_provider_error_background( + CHANNEL_ID, + "refresh_filter_catalogs.tags", + &error.to_string(), + ); + } + } + + fn resolve_option_target(&self, sort: &str, options: &ServerOptions) -> Target { + if let Some(uploader) = options.sites.as_deref() { + if uploader.starts_with(&self.url) && uploader != "all" { + return Target::Archive(uploader.to_string()); + } + } + + if let Some(tag) = options.filter.as_deref() { + if tag.starts_with(&self.url) && tag != "all" { + return Target::Archive(tag.to_string()); + } + } + + if let Some(category) = options.categories.as_deref() { + if category.starts_with(&self.url) && category != "all" { + return Target::Archive(category.to_string()); + } + } + + if matches!(sort, "new" | "latest" | "date" | "recent" | "") { + return Target::Latest; + } + + 
Target::Archive(self.default_archive_url_for_sort(sort)) + } + + fn resolve_query_target(&self, query: &str) -> Target { + if let Ok(uploaders) = self.uploaders.read() { + if let Some(value) = Self::match_filter(&uploaders, query) { + return Target::Archive(value); + } + } + + if let Ok(tags) = self.tags.read() { + if let Some(value) = Self::match_filter(&tags, query) { + return Target::Archive(value); + } + } + + if let Ok(categories) = self.categories.read() { + if let Some(value) = Self::match_filter(&categories, query) { + return Target::Archive(value); + } + } + + Target::Search(query.to_string()) + } + + fn build_target_request(&self, target: &Target, page: u16, sort: &str) -> String { + match target { + Target::Latest => self.build_latest_url(page), + Target::Archive(url) => self.build_archive_url(url, page, sort), + Target::Search(query) => self.build_search_url(query, page, sort), + } + } + + fn list_container<'a>(&self, document: &'a Html) -> Result>> { + for selector_text in [ + "#list_videos_latest_videos_list_items", + "#list_videos_common_videos_list_items", + "#list_videos_videos_list_search_result_items", + "#list_videos_related_videos_items", + "#list_videos_videos_watched_right_now_items", + ] { + let selector = Self::selector(selector_text)?; + if let Some(element) = document.select(&selector).next() { + return Ok(Some(element)); + } + } + Ok(None) + } + + fn parse_list_videos(&self, html: &str) -> Result> { + let document = Html::parse_document(html); + let Some(container) = self.list_container(&document)? 
else { + return Ok(vec![]); + }; + + let card_selector = Self::selector("div.item")?; + let link_selector = Self::selector("a[href*=\"/videos/\"]")?; + let image_selector = Self::selector("div.img img")?; + let title_selector = Self::selector("strong.title")?; + let duration_selector = Self::selector("div.duration")?; + let rating_selector = Self::selector("div.rating")?; + let views_selector = Self::selector("div.views")?; + + let mut items = Vec::new(); + + for card in container.select(&card_selector) { + let Some(link) = card.select(&link_selector).next() else { + continue; + }; + + let href = link.value().attr("href").unwrap_or_default(); + let page_url = self.normalize_url(href); + if page_url.is_empty() { + continue; + } + + let id = page_url + .trim_end_matches('/') + .split('/') + .nth_back(1) + .unwrap_or_default() + .to_string(); + if id.is_empty() { + continue; + } + + let title = card + .select(&title_selector) + .next() + .map(|value| Self::clean_title(&Self::text_of(&value))) + .filter(|value| !value.is_empty()) + .or_else(|| { + link.value() + .attr("title") + .map(Self::clean_title) + .filter(|value| !value.is_empty()) + }); + let Some(title) = title else { + continue; + }; + + let image = card.select(&image_selector).next(); + let thumb = image + .and_then(|value| { + value + .value() + .attr("data-webp") + .or_else(|| value.value().attr("data-original")) + .or_else(|| value.value().attr("src")) + }) + .map(|value| self.normalize_url(value)) + .unwrap_or_default(); + if thumb.is_empty() { + continue; + } + + let duration = card + .select(&duration_selector) + .next() + .map(|value| Self::parse_duration(&Self::text_of(&value))) + .unwrap_or(0); + let views = card + .select(&views_selector) + .next() + .and_then(|value| Self::parse_views(&Self::text_of(&value))); + let rating = card + .select(&rating_selector) + .next() + .and_then(|value| Self::parse_percent(&Self::text_of(&value))); + + let mut item = VideoItem::new( + id, + title, + page_url, + 
CHANNEL_ID.to_string(), + thumb, + duration, + ); + item.views = views; + item.rating = rating; + items.push(item); + } + + Ok(items) + } + + fn lookup_category_url_by_title(&self, title: &str) -> Option { + let categories = self.categories.read().ok()?; + categories + .iter() + .find(|value| { + value.id != "all" && Self::normalize_title(&value.title) == Self::normalize_title(title) + }) + .map(|value| value.id.clone()) + } + + fn lookup_tag_url_by_title(&self, title: &str) -> Option { + let tags = self.tags.read().ok()?; + tags.iter() + .find(|value| { + value.id != "all" && Self::normalize_title(&value.title) == Self::normalize_title(title) + }) + .map(|value| value.id.clone()) + } + + async fn enrich_video(&self, mut item: VideoItem, options: &ServerOptions) -> VideoItem { + let mut requester = requester_or_default(options, CHANNEL_ID, "enrich_video.requester"); + let html = match requester.get(&item.url, None).await { + Ok(value) => value, + Err(error) => { + report_provider_error_background( + CHANNEL_ID, + "enrich_video.request", + &format!("url={}; error={error}", item.url), + ); + return item; + } + }; + + let document = Html::parse_document(&html); + let info_selector = match Self::selector(".block-details .info .item") { + Ok(value) => value, + Err(_) => return item, + }; + let span_selector = match Self::selector("span") { + Ok(value) => value, + Err(_) => return item, + }; + let category_selector = match Self::selector(".block-details .info a[href*=\"/categories/\"]") + { + Ok(value) => value, + Err(_) => return item, + }; + let tag_selector = match Self::selector(".block-details .info a[href*=\"/tags/\"]") { + Ok(value) => value, + Err(_) => return item, + }; + + if let Some(title) = Self::extract_json_string(&html, "name") + .or_else(|| Self::extract_json_string(&html, "headline")) + { + let title = Self::clean_title(&title); + if !title.is_empty() { + item.title = title; + } + } + + if let Some(preview) = Self::extract_js_value(&html, "preview_url") 
+ .or_else(|| Self::extract_json_string(&html, "thumbnailUrl")) + { + let preview = self.normalize_url(&preview); + if !preview.is_empty() { + item.preview = Some(preview.clone()); + if item.thumb.is_empty() { + item.thumb = preview; + } + } + } + + if let (Some(width), Some(height)) = ( + Self::extract_js_value(&html, "player_width") + .and_then(|value| value.parse::().ok()), + Self::extract_js_value(&html, "player_height") + .and_then(|value| value.parse::().ok()), + ) { + if width > 0.0 && height > 0.0 { + item.aspectRatio = Some(width / height); + } + } + + if let Some(duration) = Self::extract_json_string(&html, "duration") { + if let Some(duration) = Self::parse_iso8601_duration_seconds(&duration) { + if duration > 0 { + item.duration = duration; + } + } + } + + if let Some(uploaded_at) = Self::parse_upload_date_timestamp(&html) { + item.uploadedAt = Some(uploaded_at); + } + + let watch_action_regex = match Self::regex( + r#"(?s)"interactionType"\s*:\s*"http://schema.org/WatchAction".*?"userInteractionCount"\s*:\s*"(\d+)""#, + ) { + Ok(value) => value, + Err(_) => return item, + }; + if let Some(views) = watch_action_regex + .captures(&html) + .and_then(|value| value.get(1)) + .and_then(|value| value.as_str().parse::().ok()) + { + item.views = Some(views); + } + + let mut category_entries = Vec::<(String, String)>::new(); + let mut tag_entries = Vec::<(String, String)>::new(); + + for element in document.select(&category_selector) { + let title = Self::clean_title(&Self::text_of(&element)); + if title.is_empty() { + continue; + } + let url = self.normalize_url(element.value().attr("href").unwrap_or_default()); + if url.is_empty() { + continue; + } + category_entries.push((title.clone(), url.clone())); + Self::push_unique( + &self.categories, + FilterOption { + id: url.clone(), + title: title.clone(), + }, + ); + Self::push_unique( + &self.uploaders, + FilterOption { + id: url, + title, + }, + ); + } + + for element in document.select(&tag_selector) { + let 
title = Self::clean_title(&Self::text_of(&element)); + if title.is_empty() { + continue; + } + let url = self.normalize_url(element.value().attr("href").unwrap_or_default()); + if url.is_empty() { + continue; + } + tag_entries.push((title.clone(), url.clone())); + Self::push_unique( + &self.tags, + FilterOption { + id: url, + title, + }, + ); + } + + if category_entries.is_empty() { + if let Some(category_text) = Self::extract_js_value(&html, "video_categories") { + for raw in category_text.split(',') { + let title = Self::clean_title(raw); + if title.is_empty() { + continue; + } + let url = self.lookup_category_url_by_title(&title).unwrap_or_default(); + category_entries.push((title, url)); + } + } + } + + if tag_entries.is_empty() { + if let Some(tag_text) = Self::extract_js_value(&html, "video_tags") { + for raw in tag_text.split(',') { + let title = Self::clean_title(raw); + if title.is_empty() { + continue; + } + let url = self.lookup_tag_url_by_title(&title).unwrap_or_default(); + tag_entries.push((title, url)); + } + } + } + + if let Some((uploader, uploader_url)) = category_entries.first() { + item.uploader = Some(uploader.clone()); + if !uploader_url.is_empty() { + item.uploaderUrl = Some(uploader_url.clone()); + } + } + + let mut tag_values = category_entries + .iter() + .map(|(title, _)| title.clone()) + .collect::>(); + tag_values.extend(tag_entries.into_iter().map(|(title, _)| title)); + tag_values.sort(); + tag_values.dedup(); + if !tag_values.is_empty() { + item.tags = Some(tag_values); + } + + for info in document.select(&info_selector) { + for span in info.select(&span_selector) { + let text = Self::text_of(&span); + if let Some(value) = text.strip_prefix("Duration:") { + let duration = Self::parse_duration(value.trim()); + if duration > 0 { + item.duration = duration; + } + } else if let Some(value) = text.strip_prefix("Views:") { + if let Some(views) = Self::parse_views(value.trim()) { + item.views = Some(views); + } + } + } + } + + let 
rating_selector = match Self::selector(".rating-container .voters, .rating-container .rating") { + Ok(value) => value, + Err(_) => return item, + }; + for element in document.select(&rating_selector) { + let text = Self::text_of(&element); + if let Some(rating) = Self::parse_percent(&text) { + item.rating = Some(rating); + break; + } + } + + item + } + + async fn fetch_items_for_url( + &self, + cache: VideoCache, + url: String, + per_page_limit: usize, + options: &ServerOptions, + ) -> Result> { + if let Some((time, items)) = cache.get(&url) { + if time.elapsed().unwrap_or_default().as_secs() < 60 * 5 { + return Ok(items.into_iter().take(per_page_limit.max(1)).collect()); + } + } + + let mut requester = requester_or_default(options, CHANNEL_ID, "fetch_items_for_url.requester"); + let html = Self::fetch_html(&mut requester, &url).await?; + let list_items = self.parse_list_videos(&html)?; + if list_items.is_empty() { + return Ok(vec![]); + } + + let enriched = stream::iter(list_items.into_iter().map(|item| { + let provider = self.clone(); + let options = options.clone(); + async move { provider.enrich_video(item, &options).await } + })) + .buffer_unordered(4) + .collect::>() + .await; + + if !enriched.is_empty() { + cache.remove(&url); + cache.insert(url, enriched.clone()); + } + + Ok(enriched.into_iter().take(per_page_limit.max(1)).collect()) + } + + async fn get( + &self, + cache: VideoCache, + page: u16, + sort: &str, + per_page_limit: usize, + options: ServerOptions, + ) -> Result> { + let target = self.resolve_option_target(sort, &options); + let url = self.build_target_request(&target, page, sort); + self.fetch_items_for_url(cache, url, per_page_limit, &options) + .await + } + + async fn query( + &self, + cache: VideoCache, + page: u16, + sort: &str, + query: &str, + per_page_limit: usize, + options: ServerOptions, + ) -> Result> { + let target = self.resolve_query_target(query); + let url = self.build_target_request(&target, page, sort); + 
self.fetch_items_for_url(cache, url, per_page_limit, &options) + .await + } +} + +#[async_trait] +impl Provider for ArabpornxxxProvider { + async fn get_videos( + &self, + cache: VideoCache, + pool: DbPool, + sort: String, + query: Option, + page: String, + per_page: String, + options: ServerOptions, + ) -> Vec { + let _ = pool; + let page = page.parse::().unwrap_or(1); + let per_page_limit = per_page.parse::().unwrap_or(30); + + if self.filters_need_refresh() { + self.refresh_filter_catalogs().await; + } + + let result = match query { + Some(query) if !query.trim().is_empty() => { + self.query(cache, page, &sort, &query, per_page_limit, options) + .await + } + _ => self.get(cache, page, &sort, per_page_limit, options).await, + }; + + match result { + Ok(videos) => videos, + Err(error) => { + report_provider_error(CHANNEL_ID, "get_videos", &error.to_string()).await; + vec![] + } + } + } + + fn get_channel(&self, clientversion: ClientVersion) -> Option { + Some(self.build_channel(clientversion)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn provider() -> ArabpornxxxProvider { + ArabpornxxxProvider { + url: BASE_URL.to_string(), + categories: Arc::new(RwLock::new(vec![ + FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }, + FilterOption { + id: format!("{BASE_URL}/categories/hijab-mylfs/"), + title: "Hijab Mylfs".to_string(), + }, + ])), + tags: Arc::new(RwLock::new(vec![ + FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }, + FilterOption { + id: format!("{BASE_URL}/tags/arabic-porn/"), + title: "Arabic Porn".to_string(), + }, + ])), + uploaders: Arc::new(RwLock::new(vec![ + FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }, + FilterOption { + id: format!("{BASE_URL}/categories/hijab-mylfs/"), + title: "Hijab Mylfs".to_string(), + }, + ])), + } + } + + #[test] + fn builds_search_page_two_url() { + let provider = provider(); + assert_eq!( + provider.build_search_url("arab hijab", 2, 
"recommended"), + "https://arabporn.xxx/search/arab-hijab/?mode=async&function=get_block&block_id=list_videos_videos_list_search_result&q=arab+hijab&category_ids=&sort_by=most_favourited&from_videos=2&from_albums=2" + ); + } + + #[test] + fn resolves_known_tag_query_to_archive() { + let provider = provider(); + match provider.resolve_query_target("arabic porn") { + Target::Archive(url) => { + assert_eq!(url, "https://arabporn.xxx/tags/arabic-porn/"); + } + _ => panic!("expected archive target"), + } + } + + #[test] + fn defaults_non_new_sorts_to_common_archive_root() { + let provider = provider(); + match provider.resolve_option_target( + "recommended", + &ServerOptions { + featured: None, + category: None, + sites: None, + filter: None, + language: None, + public_url_base: None, + requester: None, + network: None, + stars: None, + categories: None, + duration: None, + sort: None, + sexuality: None, + }, + ) { + Target::Archive(url) => { + assert_eq!(url, "https://arabporn.xxx/most-popular/"); + } + _ => panic!("expected archive target"), + } + } +} diff --git a/src/providers/pornhd3x.rs b/src/providers/pornhd3x.rs new file mode 100644 index 0000000..53666df --- /dev/null +++ b/src/providers/pornhd3x.rs @@ -0,0 +1,1298 @@ +use crate::DbPool; +use crate::api::ClientVersion; +use crate::providers::{ + Provider, report_provider_error, report_provider_error_background, requester_or_default, +}; +use crate::status::*; +use crate::util::cache::VideoCache; +use crate::videos::{ServerOptions, VideoEmbed, VideoFormat, VideoItem}; +use async_trait::async_trait; +use chrono::{NaiveDate, TimeZone, Utc}; +use error_chain::error_chain; +use futures::stream::{self, StreamExt}; +use htmlentity::entity::{ICodedDataTrait, decode}; +use regex::Regex; +use scraper::{ElementRef, Html, Selector}; +use serde_json::Value; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::{Arc, RwLock}; +use std::{thread, vec}; +use url::Url; +use wreq::Version; + +pub const CHANNEL_METADATA: 
crate::providers::ProviderChannelMetadata = + crate::providers::ProviderChannelMetadata { + group_id: "studio-network", + tags: &["premium", "studio", "aggregator"], + }; + +error_chain! { + foreign_links { + Io(std::io::Error); + Json(serde_json::Error); + } + errors { + Parse(msg: String) { + description("parse error") + display("parse error: {}", msg) + } + } +} + +const BASE_URL: &str = "https://www.pornhd3x.tv"; +const CHANNEL_ID: &str = "pornhd3x"; +const HOME_ARCHIVE_PATH: &str = "/premium-porn-hd"; +const SOURCE_SECRET: &str = "98126avrbi6m49vd7shxkn985"; +const SOURCE_COOKIE_PREFIX: &str = "826avrbi6m49vd7shxkn985m"; +const SOURCE_COOKIE_SUFFIX: &str = "k06twz87wwxtp3dqiicks2df"; +const RECENT_TAG_DETAIL_LIMIT: usize = 8; +const DETAIL_CONCURRENCY: usize = 4; + +#[derive(Debug, Clone)] +pub struct Pornhd3xProvider { + url: String, + categories: Arc>>, + tags: Arc>>, + stars: Arc>>, + uploaders: Arc>>, + source_counter: Arc, +} + +#[derive(Debug, Clone)] +enum Target { + Latest, + Search(String), + Archive(String), +} + +#[derive(Debug, Clone)] +struct ListStub { + slug: String, + detail_url: String, + title: String, + thumb: String, + preview: Option, +} + +#[derive(Debug, Clone, Default)] +struct DetailMetadata { + title: Option, + description: Option, + thumb: Option, + studio: Option<(String, String)>, + categories: Vec<(String, String)>, + actors: Vec<(String, String)>, + tags: Vec<(String, String)>, + uploaded_at: Option, + episode_id: Option, +} + +impl Pornhd3xProvider { + pub fn new() -> Self { + let provider = Self { + url: BASE_URL.to_string(), + categories: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + tags: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + stars: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + uploaders: Arc::new(RwLock::new(vec![FilterOption { + id: 
"all".to_string(), + title: "All".to_string(), + }])), + source_counter: Arc::new(AtomicU32::new(0)), + }; + provider.spawn_initial_load(); + provider + } + + fn spawn_initial_load(&self) { + let provider = self.clone(); + thread::spawn(move || { + let runtime = match tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + { + Ok(runtime) => runtime, + Err(error) => { + report_provider_error_background( + CHANNEL_ID, + "spawn_initial_load.runtime_build", + &error.to_string(), + ); + return; + } + }; + + runtime.block_on(async move { + provider.refresh_filter_catalogs().await; + }); + }); + } + + fn build_channel(&self, _clientversion: ClientVersion) -> Channel { + let categories = self + .categories + .read() + .map(|value| value.clone()) + .unwrap_or_default(); + let tags = self + .tags + .read() + .map(|value| value.clone()) + .unwrap_or_default(); + let stars = self + .stars + .read() + .map(|value| value.clone()) + .unwrap_or_default(); + let uploaders = self + .uploaders + .read() + .map(|value| value.clone()) + .unwrap_or_default(); + + Channel { + id: CHANNEL_ID.to_string(), + name: "PornHD3X".to_string(), + description: + "Premium studio archive pages with studio, category, pornstar, and keyword filters." + .to_string(), + premium: false, + favicon: "https://www.google.com/s2/favicons?sz=64&domain=www.pornhd3x.tv" + .to_string(), + status: "active".to_string(), + categories: categories.iter().map(|value| value.title.clone()).collect(), + options: vec![ + ChannelOption { + id: "sort".to_string(), + title: "Sort".to_string(), + description: + "Latest archive only. Extra top lists on the site currently error server-side." 
+ .to_string(), + systemImage: "list.number".to_string(), + colorName: "blue".to_string(), + options: vec![FilterOption { + id: "new".to_string(), + title: "Latest".to_string(), + }], + multiSelect: false, + }, + ChannelOption { + id: "sites".to_string(), + title: "Uploaders".to_string(), + description: "Browse studio archives directly.".to_string(), + systemImage: "person.crop.square".to_string(), + colorName: "purple".to_string(), + options: uploaders, + multiSelect: false, + }, + ChannelOption { + id: "categories".to_string(), + title: "Categories".to_string(), + description: "Browse a category archive.".to_string(), + systemImage: "square.grid.2x2".to_string(), + colorName: "orange".to_string(), + options: categories, + multiSelect: false, + }, + ChannelOption { + id: "stars".to_string(), + title: "Pornstars".to_string(), + description: "Browse a pornstar archive.".to_string(), + systemImage: "star.fill".to_string(), + colorName: "yellow".to_string(), + options: stars, + multiSelect: false, + }, + ChannelOption { + id: "filter".to_string(), + title: "Tags".to_string(), + description: "Browse discovered keyword archives.".to_string(), + systemImage: "tag.fill".to_string(), + colorName: "green".to_string(), + options: tags, + multiSelect: false, + }, + ], + nsfw: true, + cacheDuration: Some(1800), + } + } + + fn selector(value: &str) -> Result { + Selector::parse(value) + .map_err(|error| Error::from(format!("selector `{value}` parse failed: {error}"))) + } + + fn regex(value: &str) -> Result { + Regex::new(value).map_err(|error| Error::from(format!("regex `{value}` failed: {error}"))) + } + + fn collapse_whitespace(text: &str) -> String { + text.split_whitespace().collect::>().join(" ") + } + + fn decode_html(text: &str) -> String { + decode(text.as_bytes()) + .to_string() + .unwrap_or_else(|_| text.to_string()) + } + + fn text_of(element: &ElementRef<'_>) -> String { + Self::decode_html(&Self::collapse_whitespace( + &element.text().collect::>().join(" "), + )) 
+ } + + fn titleize_slug(slug: &str) -> String { + slug.split(['-', '_']) + .filter(|part| !part.is_empty()) + .map(|part| { + let mut chars = part.chars(); + match chars.next() { + Some(first) => { + let mut word = first.to_uppercase().collect::(); + word.push_str(chars.as_str()); + word + } + None => String::new(), + } + }) + .collect::>() + .join(" ") + } + + fn normalize_title(value: &str) -> String { + value + .to_ascii_lowercase() + .replace('&', " and ") + .chars() + .map(|ch| if ch.is_ascii_alphanumeric() { ch } else { ' ' }) + .collect::() + .split_whitespace() + .collect::>() + .join(" ") + } + + fn push_unique(target: &Arc>>, item: FilterOption) { + if item.id.is_empty() || item.title.is_empty() { + return; + } + if let Ok(mut values) = target.write() { + if !values.iter().any(|value| value.id == item.id) { + values.push(item); + } + } + } + + fn normalize_url(&self, raw: &str) -> String { + let value = raw.trim(); + if value.is_empty() { + return String::new(); + } + + if value.starts_with("//") { + return format!("https:{value}"); + } + + if let Ok(url) = Url::parse(value) { + if let Some(host) = url.host_str() { + if host.contains("pornhd3x.tv") || host.contains("brazzers3x.") { + return format!( + "{}{}{}", + self.url, + url.path(), + url.query() + .map(|query| format!("?{query}")) + .unwrap_or_default() + ); + } + } + if value.starts_with("http://") { + return value.replacen("http://", "https://", 1); + } + return value.to_string(); + } + + if value.starts_with('/') { + return format!("{}{}", self.url, value); + } + + format!("{}/{}", self.url.trim_end_matches('/'), value) + } + + fn sanitize_search_query(query: &str) -> String { + query + .chars() + .map(|ch| { + if ch.is_ascii_alphanumeric() || ch == ' ' || ch == '-' { + ch + } else { + ' ' + } + }) + .collect::() + .split_whitespace() + .collect::>() + .join(" ") + } + + fn build_home_url(&self, page: u32) -> String { + if page > 1 { + format!("{}{}/page-{}", self.url, HOME_ARCHIVE_PATH, page) + 
} else {
+            format!("{}{}", self.url, HOME_ARCHIVE_PATH)
+        }
+    }
+
+    /// Builds the search listing URL for `query`.
+    ///
+    /// The query is first cleaned by `sanitize_search_query`, its spaces are
+    /// encoded as `%20`, and a `/page-N` suffix is appended for every page
+    /// after the first.
+    fn build_search_url(&self, query: &str, page: u32) -> String {
+        let query = Self::sanitize_search_query(query).replace(' ', "%20");
+        if page > 1 {
+            format!("{}/search/{}/page-{}", self.url, query, page)
+        } else {
+            format!("{}/search/{}", self.url, query)
+        }
+    }
+
+    /// Returns the URL of page `page` of the archive rooted at `base`.
+    ///
+    /// Trailing slashes on `base` are dropped; pages after the first get a
+    /// `/page-N` suffix, the first page is the bare archive URL.
+    fn build_archive_page_url(base: &str, page: u32) -> String {
+        let base = base.trim_end_matches('/');
+        if page > 1 {
+            format!("{base}/page-{page}")
+        } else {
+            base.to_string()
+        }
+    }
+
+    /// Reports whether `value` (a client-supplied filter id) points at an archive
+    /// page on this provider's own host.
+    ///
+    /// The value is normalized first, then must (a) start with the provider base
+    /// URL, (b) parse to the same scheme and host as the base URL, and (c) contain
+    /// one of the known archive path markers.
+    fn is_allowed_archive_url(&self, value: &str) -> bool {
+        let normalized = self.normalize_url(value);
+        if !normalized.starts_with(&self.url) {
+            return false;
+        }
+
+        // A string-prefix test alone is not a host check: in
+        // `https://www.pornhd3x.tv@evil.example/studio/x` the base URL is only the
+        // userinfo part and the request would go to `evil.example`. Compare the
+        // parsed scheme and host so such values are rejected.
+        let (Ok(candidate), Ok(base)) = (Url::parse(&normalized), Url::parse(&self.url)) else {
+            return false;
+        };
+
+        candidate.scheme() == base.scheme()
+            && candidate.host_str() == base.host_str()
+            && [
+                "/studio/",
+                "/category/",
+                "/tag/",
+                "/pornstar/",
+                HOME_ARCHIVE_PATH,
+            ]
+            .iter()
+            .any(|prefix| normalized.contains(prefix))
+    }
+
+    /// Looks up `query` among `options` by title and returns the matching option id.
+    ///
+    /// Both sides are compared after `normalize_title`; the placeholder entry with
+    /// id `"all"` never matches.
+    fn match_filter(options: &[FilterOption], query: &str) -> Option<String> {
+        let normalized_query = Self::normalize_title(query);
+        options
+            .iter()
+            .find(|value| {
+                value.id != "all" && Self::normalize_title(&value.title) == normalized_query
+            })
+            .map(|value| value.id.clone())
+    }
+
+    /// Returns `true` while any of the four filter catalogs holds at most one
+    /// entry. A catalog whose lock cannot be read counts as empty.
+    fn filters_need_refresh(&self) -> bool {
+        let categories_len = self
+            .categories
+            .read()
+            .map(|values| values.len())
+            .unwrap_or_default();
+        let tags_len = self
+            .tags
+            .read()
+            .map(|values| values.len())
+            .unwrap_or_default();
+        let stars_len = self
+            .stars
+            .read()
+            .map(|values| values.len())
+            .unwrap_or_default();
+        let uploaders_len = self
+            .uploaders
+            .read()
+            .map(|values| values.len())
+            .unwrap_or_default();
+
+        categories_len <= 1 || tags_len <= 1 || stars_len <= 1 || uploaders_len <= 1
+    }
+
+    /// Runs the home-page, sitemap and recent-tag catalog loaders in sequence.
+    /// A failing loader is reported in the background and does not stop the
+    /// loaders after it.
+    async fn refresh_filter_catalogs(&self) {
+        if let Err(error) = self.load_home_catalogs().await {
+            report_provider_error_background(
+                CHANNEL_ID,
+                "refresh_filter_catalogs.home",
+                &error.to_string(),
+            );
+        }
+        if let Err(error) = self.load_sitemap_catalogs().await {
+            report_provider_error_background(
+                CHANNEL_ID,
+                "refresh_filter_catalogs.sitemap",
+
&error.to_string(), + ); + } + if let Err(error) = self.load_recent_tags().await { + report_provider_error_background( + CHANNEL_ID, + "refresh_filter_catalogs.recent_tags", + &error.to_string(), + ); + } + } + + async fn ensure_catalogs(&self) { + if self.filters_need_refresh() { + self.refresh_filter_catalogs().await; + } + } + + async fn load_home_catalogs(&self) -> Result<()> { + let mut requester = requester_or_default( + &ServerOptions { + featured: None, + category: None, + sites: None, + filter: None, + language: None, + public_url_base: None, + requester: None, + network: None, + stars: None, + categories: None, + duration: None, + sort: None, + sexuality: None, + }, + CHANNEL_ID, + "load_home_catalogs.requester", + ); + let html = requester + .get(BASE_URL, Some(Version::HTTP_11)) + .await + .map_err(|error| Error::from(error.to_string()))?; + let document = Html::parse_document(&html); + + let category_selector = Self::selector("#menu a.ml-item[href^=\"/category/\"]")?; + let studio_selector = Self::selector("#menu a.ml-item[href^=\"/studio/\"]")?; + + for element in document.select(&category_selector) { + let Some(href) = element.value().attr("href") else { + continue; + }; + let title = Self::text_of(&element); + if title.is_empty() { + continue; + } + Self::push_unique( + &self.categories, + FilterOption { + id: self.normalize_url(href), + title, + }, + ); + } + + for element in document.select(&studio_selector) { + let Some(href) = element.value().attr("href") else { + continue; + }; + let title = Self::text_of(&element); + if title.is_empty() { + continue; + } + Self::push_unique( + &self.uploaders, + FilterOption { + id: self.normalize_url(href), + title, + }, + ); + } + + Ok(()) + } + + async fn load_sitemap_catalogs(&self) -> Result<()> { + let mut requester = requester_or_default( + &ServerOptions { + featured: None, + category: None, + sites: None, + filter: None, + language: None, + public_url_base: None, + requester: None, + network: None, + 
stars: None,
+                categories: None,
+                duration: None,
+                sort: None,
+                sexuality: None,
+            },
+            CHANNEL_ID,
+            "load_sitemap_catalogs.requester",
+        );
+        let xml = requester
+            .get(&format!("{}/sitemap.xml", self.url), Some(Version::HTTP_11))
+            .await
+            .map_err(|error| Error::from(error.to_string()))?;
+        // Only the contents of `<loc>` elements are URLs. Without the element
+        // anchors the pattern matches every run of text between tags, and none of
+        // those runs normalizes to a `/studio/` or `/pornstar/` path.
+        let loc_regex = Self::regex(r"<loc>([^<]+)</loc>")?;
+
+        for captures in loc_regex.captures_iter(&xml) {
+            let Some(raw_url) = captures.get(1).map(|value| value.as_str()) else {
+                continue;
+            };
+            let normalized = self.normalize_url(raw_url);
+            let Some(parsed) = Url::parse(&normalized).ok() else {
+                continue;
+            };
+            let path = parsed.path().trim_end_matches('/');
+
+            if let Some(slug) = path.strip_prefix("/studio/") {
+                Self::push_unique(
+                    &self.uploaders,
+                    FilterOption {
+                        id: normalized.clone(),
+                        title: Self::titleize_slug(slug),
+                    },
+                );
+            } else if let Some(slug) = path.strip_prefix("/pornstar/") {
+                Self::push_unique(
+                    &self.stars,
+                    FilterOption {
+                        id: normalized.clone(),
+                        title: Self::titleize_slug(slug),
+                    },
+                );
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Harvests filter entries from the newest videos.
+    ///
+    /// Fetches the first page of the home archive, then loads up to
+    /// `RECENT_TAG_DETAIL_LIMIT` detail pages and stores whatever filters their
+    /// metadata exposes.
+    ///
+    /// # Errors
+    ///
+    /// Fails only when the archive page itself cannot be fetched or parsed.
+    /// Detail pages are best effort: a failed request is reported in the
+    /// background and skipped, and unparsable metadata is ignored.
+    async fn load_recent_tags(&self) -> Result<()> {
+        let mut requester = requester_or_default(
+            &ServerOptions {
+                featured: None,
+                category: None,
+                sites: None,
+                filter: None,
+                language: None,
+                public_url_base: None,
+                requester: None,
+                network: None,
+                stars: None,
+                categories: None,
+                duration: None,
+                sort: None,
+                sexuality: None,
+            },
+            CHANNEL_ID,
+            "load_recent_tags.requester",
+        );
+        let html = requester
+            .get(&self.build_home_url(1), Some(Version::HTTP_11))
+            .await
+            .map_err(|error| Error::from(error.to_string()))?;
+        let stubs = self.parse_list_items(&html)?;
+        for stub in stubs.into_iter().take(RECENT_TAG_DETAIL_LIMIT) {
+            // One unreachable detail page must not discard the filters that the
+            // remaining pages could still contribute.
+            let detail_html = match requester
+                .get(&stub.detail_url, Some(Version::HTTP_11))
+                .await
+            {
+                Ok(detail_html) => detail_html,
+                Err(error) => {
+                    report_provider_error_background(
+                        CHANNEL_ID,
+                        "load_recent_tags.detail",
+                        &format!("detail_url={}; error={}", stub.detail_url, error),
+                    );
+                    continue;
+                }
+            };
+            if let Ok(metadata) = self.parse_detail_metadata(&detail_html) {
+                self.store_detail_filters(&metadata);
+            }
+        }
+        Ok(())
+    }
+
+    fn
resolve_option_target(&self, options: &ServerOptions) -> Target { + if let Some(value) = options.sites.as_deref() { + if value != "all" && self.is_allowed_archive_url(value) { + return Target::Archive(self.normalize_url(value)); + } + } + if let Some(value) = options.stars.as_deref() { + if value != "all" && self.is_allowed_archive_url(value) { + return Target::Archive(self.normalize_url(value)); + } + } + if let Some(value) = options.filter.as_deref() { + if value != "all" && self.is_allowed_archive_url(value) { + return Target::Archive(self.normalize_url(value)); + } + } + if let Some(value) = options.categories.as_deref() { + if value != "all" && self.is_allowed_archive_url(value) { + return Target::Archive(self.normalize_url(value)); + } + } + Target::Latest + } + + fn resolve_query_target(&self, query: &str) -> Target { + if let Ok(uploaders) = self.uploaders.read() { + if let Some(target) = Self::match_filter(&uploaders, query) { + return Target::Archive(target); + } + } + if let Ok(stars) = self.stars.read() { + if let Some(target) = Self::match_filter(&stars, query) { + return Target::Archive(target); + } + } + if let Ok(tags) = self.tags.read() { + if let Some(target) = Self::match_filter(&tags, query) { + return Target::Archive(target); + } + } + if let Ok(categories) = self.categories.read() { + if let Some(target) = Self::match_filter(&categories, query) { + return Target::Archive(target); + } + } + Target::Search(query.to_string()) + } + + fn target_url(&self, target: &Target, page: u32) -> String { + match target { + Target::Latest => self.build_home_url(page), + Target::Search(query) => self.build_search_url(query, page), + Target::Archive(url) => Self::build_archive_page_url(url, page), + } + } + + fn parse_list_items(&self, html: &str) -> Result> { + let document = Html::parse_document(html); + let item_selector = Self::selector(".movies-list .ml-item")?; + let anchor_selector = Self::selector("a.ml-mask[href]")?; + let title_selector = 
Self::selector(".mli-info h2")?;
+        let img_selector = Self::selector("img[data-original], img[src]")?;
+        let preview_holder_selector = Self::selector(".thumb__img[data-preview]")?;
+        let preview_icon_selector =
+            Self::selector("span.player_icon[str], span.player_icon1[str]")?;
+
+        let mut items = Vec::new();
+
+        for element in document.select(&item_selector) {
+            let Some(anchor) = element.select(&anchor_selector).next() else {
+                continue;
+            };
+            let Some(href) = anchor.value().attr("href") else {
+                continue;
+            };
+
+            let detail_url = self.normalize_url(href);
+            if detail_url.is_empty() || !detail_url.contains("/movies/") {
+                continue;
+            }
+
+            let title = element
+                .select(&title_selector)
+                .next()
+                .map(|value| Self::text_of(&value))
+                .filter(|value| !value.is_empty())
+                .or_else(|| anchor.value().attr("title").map(Self::decode_html))
+                .unwrap_or_default();
+            if title.is_empty() {
+                continue;
+            }
+
+            let thumb = element
+                .select(&img_selector)
+                .next()
+                .and_then(|value| {
+                    value
+                        .value()
+                        .attr("data-original")
+                        .or_else(|| value.value().attr("src"))
+                })
+                .map(|value| self.normalize_url(value))
+                .unwrap_or_default();
+
+            let preview = element
+                .select(&preview_holder_selector)
+                .next()
+                .and_then(|value| value.value().attr("data-preview"))
+                .or_else(|| {
+                    element
+                        .select(&preview_icon_selector)
+                        .next()
+                        .and_then(|value| value.value().attr("str"))
+                })
+                .map(|value| self.normalize_url(value))
+                .filter(|value| !value.is_empty());
+
+            let slug = detail_url
+                .trim_end_matches('/')
+                .rsplit('/')
+                .next()
+                .unwrap_or_default()
+                .to_string();
+            if slug.is_empty() {
+                continue;
+            }
+
+            items.push(ListStub {
+                slug,
+                detail_url,
+                title,
+                thumb,
+                preview,
+            });
+        }
+
+        Ok(items)
+    }
+
+    /// Extracts an upload date embedded in a video title.
+    ///
+    /// Recognised layouts are `D.M.YYYY`, `D-M-YYYY` and `(D-M-YYYY)`. Patterns
+    /// are tried in that order and, within a pattern, every match in the title
+    /// is considered, so a number group that is not a real calendar date does
+    /// not hide a valid date later in the title.
+    ///
+    /// Returns the date at 00:00:00 UTC as a Unix timestamp in seconds, or
+    /// `None` when no valid date is found or the date is before 1970 and so
+    /// cannot be represented as `u64`.
+    fn parse_uploaded_at(title: &str) -> Option<u64> {
+        // The group names are required: the code below reads the captures by name.
+        let patterns = [
+            r"(?P<day>\d{1,2})\.(?P<month>\d{1,2})\.(?P<year>\d{4})",
+            r"(?P<day>\d{1,2})-(?P<month>\d{1,2})-(?P<year>\d{4})",
+            r"\((?P<day>\d{1,2})-(?P<month>\d{1,2})-(?P<year>\d{4})\)",
+        ];
+
+        for pattern in patterns {
+            let Ok(regex) = Regex::new(pattern) else {
+                continue;
+            };
+            // `?` inside the closure rejects only the current candidate; the
+            // search then moves on to the next match or the next pattern.
+            let timestamp = regex.captures_iter(title).find_map(|captures| {
+                let day = captures.name("day")?.as_str().parse::<u32>().ok()?;
+                let month = captures.name("month")?.as_str().parse::<u32>().ok()?;
+                let year = captures.name("year")?.as_str().parse::<i32>().ok()?;
+                let date = NaiveDate::from_ymd_opt(year, month, day)?;
+                let datetime = Utc.from_utc_datetime(&date.and_hms_opt(0, 0, 0)?);
+                // A checked conversion keeps pre-1970 dates from wrapping into a
+                // huge positive value.
+                u64::try_from(datetime.timestamp()).ok()
+            });
+            if timestamp.is_some() {
+                return timestamp;
+            }
+        }
+
+        None
+    }
+
+    fn parse_detail_metadata(&self, html: &str) -> Result<DetailMetadata> {
+        let document = Html::parse_document(html);
+        let title_selector = Self::selector(".mvic-desc h3")?;
+        let description_selector = Self::selector("meta[name=\"description\"]")?;
+        let og_image_selector = Self::selector("meta[property=\"og:image\"]")?;
+        let studio_selector = Self::selector("#bread a[href*=\"/studio/\"]")?;
+        let category_selector = Self::selector(".mvici-left a[href*=\"/category/\"]")?;
+        let actor_selector = Self::selector(".mvici-left a[href*=\"/pornstar/\"]")?;
+        let tag_selector = Self::selector("#mv-keywords a[href*=\"/tag/\"]")?;
+        let episode_selector = Self::selector("#uuid, a.btn-eps[episode-id]")?;
+
+        let title = document
+            .select(&title_selector)
+            .next()
+            .map(|value| Self::text_of(&value))
+            .filter(|value| !value.is_empty());
+
+        let description = document
+            .select(&description_selector)
+            .next()
+            .and_then(|value| value.value().attr("content"))
+            .map(Self::decode_html)
+            .filter(|value| !value.is_empty());
+
+        let thumb = document
+            .select(&og_image_selector)
+            .next()
+            .and_then(|value| value.value().attr("content"))
+            .map(|value| self.normalize_url(value))
+            .filter(|value| !value.is_empty());
+
+        let studio = document.select(&studio_selector).next().and_then(|value| {
+            let href = value.value().attr("href")?;
+            let title = Self::text_of(&value);
+            (!title.is_empty()).then_some((self.normalize_url(href), title))
+        });
+
+        let categories = document
+            .select(&category_selector)
+
.filter_map(|value| { + let href = value.value().attr("href")?; + let title = Self::text_of(&value); + (!title.is_empty()).then_some((self.normalize_url(href), title)) + }) + .collect::>(); + + let actors = document + .select(&actor_selector) + .filter_map(|value| { + let href = value.value().attr("href")?; + let title = Self::text_of(&value); + (!title.is_empty()).then_some((self.normalize_url(href), title)) + }) + .collect::>(); + + let tags = document + .select(&tag_selector) + .filter_map(|value| { + let href = value.value().attr("href")?; + let title = Self::text_of(&value); + (!title.is_empty()).then_some((self.normalize_url(href), title)) + }) + .collect::>(); + + let episode_id = document + .select(&episode_selector) + .find_map(|value| { + value + .value() + .attr("value") + .or_else(|| value.value().attr("episode-id")) + }) + .map(ToOwned::to_owned); + + Ok(DetailMetadata { + uploaded_at: title.as_deref().and_then(Self::parse_uploaded_at), + title, + description, + thumb, + studio, + categories, + actors, + tags, + episode_id, + }) + } + + fn store_detail_filters(&self, metadata: &DetailMetadata) { + if let Some((url, title)) = &metadata.studio { + Self::push_unique( + &self.uploaders, + FilterOption { + id: url.clone(), + title: title.clone(), + }, + ); + } + + for (url, title) in &metadata.categories { + Self::push_unique( + &self.categories, + FilterOption { + id: url.clone(), + title: title.clone(), + }, + ); + } + + for (url, title) in &metadata.actors { + Self::push_unique( + &self.stars, + FilterOption { + id: url.clone(), + title: title.clone(), + }, + ); + } + + for (url, title) in &metadata.tags { + Self::push_unique( + &self.tags, + FilterOption { + id: url.clone(), + title: title.clone(), + }, + ); + } + } + + fn build_source_cookie_name(episode_id: &str) -> String { + format!("{SOURCE_COOKIE_PREFIX}{episode_id}{SOURCE_COOKIE_SUFFIX}") + } + + fn build_source_hash(episode_id: &str, nonce: &str) -> String { + format!( + "{:x}", + 
md5::compute(format!("{episode_id}{nonce}{SOURCE_SECRET}"))
+        )
+    }
+
+    /// Reserves the next source-request sequence number and derives its nonce.
+    ///
+    /// Returns `(count, nonce)`: `count` is the shared counter's value after the
+    /// increment, and `nonce` is `count % 0xFF_FFFF` rendered as six zero-padded
+    /// lowercase hex digits.
+    fn next_source_request(&self) -> (u32, String) {
+        // `fetch_add` wraps on overflow, so the follow-up increment has to wrap as
+        // well; a plain `+ 1` panics in overflow-checked builds once the stored
+        // value is `u32::MAX`.
+        let count = self
+            .source_counter
+            .fetch_add(1, Ordering::Relaxed)
+            .wrapping_add(1);
+        let nonce = format!("{:06x}", count % 0xFF_FFFF);
+        (count, nonce)
+    }
+
+    /// Requests the playable-source payload for `episode_id`.
+    ///
+    /// The request URL carries a hash of the episode id and a fresh nonce; the
+    /// same nonce is sent back in a cookie named after the episode, appended to
+    /// any cookies the requester already holds for that URL. `referer` is sent as
+    /// the `Referer` header.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the request fails, the response body is blank, or the body is
+    /// not valid JSON.
+    async fn fetch_sources(
+        &self,
+        requester: &mut crate::util::requester::Requester,
+        referer: &str,
+        episode_id: &str,
+    ) -> Result<Value> {
+        let (count, nonce) = self.next_source_request();
+        let source_url = format!(
+            "{}/ajax/get_sources/{}/{hash}?count={count}&mobile=true",
+            self.url,
+            episode_id,
+            hash = Self::build_source_hash(episode_id, &nonce),
+            count = count,
+        );
+        let existing_cookie = requester.cookie_header_for_url(&source_url);
+        let cookie_name = Self::build_source_cookie_name(episode_id);
+        let cookie_value = format!("{cookie_name}={nonce}");
+        let combined_cookie = match existing_cookie {
+            Some(existing) if !existing.trim().is_empty() => format!("{existing}; {cookie_value}"),
+            _ => cookie_value,
+        };
+
+        let response = requester
+            .get_with_headers(
+                &source_url,
+                vec![
+                    ("Cookie".to_string(), combined_cookie),
+                    ("Referer".to_string(), referer.to_string()),
+                    ("X-Requested-With".to_string(), "XMLHttpRequest".to_string()),
+                    (
+                        "Accept".to_string(),
+                        "application/json, text/javascript, */*; q=0.01".to_string(),
+                    ),
+                ],
+                Some(Version::HTTP_11),
+            )
+            .await
+            .map_err(|error| Error::from(error.to_string()))?;
+
+        if response.trim().is_empty() {
+            return Err(Error::from("source payload empty"));
+        }
+
+        Ok(serde_json::from_str::<Value>(&response)?)
+ } + + fn build_formats(&self, value: &Value) -> Vec { + let mut formats = Vec::new(); + for playlist in value + .get("playlist") + .and_then(|playlist| playlist.as_array()) + .into_iter() + .flatten() + { + for source in playlist + .get("sources") + .and_then(|sources| sources.as_array()) + .into_iter() + .flatten() + { + let Some(file) = source.get("file").and_then(|file| file.as_str()) else { + continue; + }; + let url = self.normalize_url(file); + if url.is_empty() { + continue; + } + let quality = source + .get("label") + .and_then(|label| label.as_str()) + .unwrap_or("HLS") + .to_string(); + let format_name = if url.contains(".m3u8") { "hls" } else { "mp4" }; + let format = VideoFormat::new(url, quality.clone(), format_name.to_string()) + .format_id(quality.to_ascii_lowercase()) + .format_note(quality); + formats.push(format); + } + } + formats + } + + async fn enrich_stub( + &self, + stub: ListStub, + options: &ServerOptions, + ) -> Result> { + let mut requester = requester_or_default(options, CHANNEL_ID, "enrich_stub.requester"); + let detail_html = requester + .get(&stub.detail_url, Some(Version::HTTP_11)) + .await + .map_err(|error| Error::from(error.to_string()))?; + let metadata = self.parse_detail_metadata(&detail_html)?; + self.store_detail_filters(&metadata); + + let Some(episode_id) = metadata.episode_id.clone() else { + return Ok(None); + }; + + let source_payload = match self + .fetch_sources(&mut requester, &stub.detail_url, &episode_id) + .await + { + Ok(payload) => payload, + Err(error) => { + report_provider_error_background( + CHANNEL_ID, + "enrich_stub.fetch_sources", + &format!("detail_url={}; error={}", stub.detail_url, error), + ); + return Ok(None); + } + }; + + let mut formats = self.build_formats(&source_payload); + let direct_url = formats + .first() + .map(|format| format.url.clone()) + .or_else(|| { + source_payload + .get("embed_url") + .or_else(|| source_payload.get("embedUrl")) + .and_then(|value| value.as_str()) + .map(|value| 
self.normalize_url(value)) + }); + + let Some(url) = direct_url else { + return Ok(None); + }; + + let title = metadata + .title + .clone() + .filter(|value| !value.is_empty()) + .unwrap_or(stub.title.clone()); + let thumb = metadata + .thumb + .clone() + .filter(|value| !value.is_empty()) + .unwrap_or(stub.thumb.clone()); + + let mut item = VideoItem::new( + episode_id.clone(), + title.clone(), + url.clone(), + CHANNEL_ID.to_string(), + thumb, + 0, + ); + item.preview = stub.preview.clone(); + if let Some((uploader_url, uploader)) = &metadata.studio { + item.uploader = Some(uploader.clone()); + item.uploaderUrl = Some(uploader_url.clone()); + } + + let mut item_tags = metadata + .tags + .iter() + .map(|(_, title)| title.clone()) + .collect::>(); + item_tags.extend(metadata.categories.iter().map(|(_, title)| title.clone())); + item_tags.extend(metadata.actors.iter().map(|(_, title)| title.clone())); + item_tags.sort(); + item_tags.dedup(); + if !item_tags.is_empty() { + item.tags = Some(item_tags); + } + + item.uploadedAt = metadata.uploaded_at; + + if !formats.is_empty() { + item.formats = Some(std::mem::take(&mut formats)); + } + + if source_payload + .get("embed") + .and_then(|value| value.as_bool()) + .unwrap_or(false) + { + if let Some(embed_url) = source_payload + .get("embed_url") + .or_else(|| source_payload.get("embedUrl")) + .and_then(|value| value.as_str()) + { + item.embed = Some(VideoEmbed { + html: format!( + "", + self.normalize_url(embed_url) + ), + source: self.normalize_url(embed_url), + }); + } + } + + Ok(Some(item)) + } + + async fn fetch_items_for_target( + &self, + cache: VideoCache, + target: Target, + page: u32, + options: ServerOptions, + ) -> Result> { + let request_url = self.target_url(&target, page); + let old_items = match cache.get(&request_url) { + Some((time, items)) => { + if time.elapsed().unwrap_or_default().as_secs() < 60 * 30 { + return Ok(items.clone()); + } + items.clone() + } + None => vec![], + }; + + let mut requester = + 
requester_or_default(&options, CHANNEL_ID, "fetch_items_for_target.list"); + let html = match requester.get(&request_url, Some(Version::HTTP_11)).await { + Ok(html) => html, + Err(error) => { + report_provider_error( + CHANNEL_ID, + "fetch_items_for_target.list", + &format!("url={request_url}; error={error}"), + ) + .await; + return Ok(old_items); + } + }; + + let stubs = match self.parse_list_items(&html) { + Ok(stubs) => stubs, + Err(error) => { + report_provider_error( + CHANNEL_ID, + "fetch_items_for_target.parse_list", + &format!("url={request_url}; error={error}"), + ) + .await; + return Ok(old_items); + } + }; + + let videos = stream::iter(stubs.into_iter().map(|stub| { + let options = options.clone(); + async move { self.enrich_stub(stub, &options).await.ok().flatten() } + })) + .buffer_unordered(DETAIL_CONCURRENCY) + .filter_map(async move |value| value) + .collect::>() + .await; + + if !videos.is_empty() { + cache.remove(&request_url); + cache.insert(request_url, videos.clone()); + return Ok(videos); + } + + Ok(old_items) + } +} + +#[async_trait] +impl Provider for Pornhd3xProvider { + async fn get_videos( + &self, + cache: VideoCache, + _pool: DbPool, + _sort: String, + query: Option, + page: String, + _per_page: String, + options: ServerOptions, + ) -> Vec { + self.ensure_catalogs().await; + + let page = page + .parse::() + .ok() + .filter(|value| *value > 0) + .unwrap_or(1); + let target = match query + .as_deref() + .map(str::trim) + .filter(|value| !value.is_empty()) + { + Some(query) => self.resolve_query_target(query), + None => self.resolve_option_target(&options), + }; + + match self + .fetch_items_for_target(cache, target, page, options) + .await + { + Ok(items) => items, + Err(error) => { + report_provider_error_background(CHANNEL_ID, "get_videos", &error.to_string()); + vec![] + } + } + } + + fn get_channel(&self, clientversion: ClientVersion) -> Option { + Some(self.build_channel(clientversion)) + } +} + +#[cfg(test)] +mod tests { + use 
super::*; + + impl Pornhd3xProvider { + fn new_for_tests() -> Self { + Self { + url: BASE_URL.to_string(), + categories: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + tags: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + stars: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + uploaders: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + source_counter: Arc::new(AtomicU32::new(0)), + } + } + } + + #[test] + fn builds_search_urls_like_live_site() { + let provider = Pornhd3xProvider::new_for_tests(); + assert_eq!( + provider.build_search_url("Brazzers", 1), + "https://www.pornhd3x.tv/search/Brazzers" + ); + assert_eq!( + provider.build_search_url("big tits", 3), + "https://www.pornhd3x.tv/search/big%20tits/page-3" + ); + } + + #[test] + fn builds_source_cookie_name_and_hash() { + assert_eq!( + Pornhd3xProvider::build_source_cookie_name("49Q27JL3HCPVNJQN"), + "826avrbi6m49vd7shxkn985m49Q27JL3HCPVNJQNk06twz87wwxtp3dqiicks2df" + ); + assert_eq!( + Pornhd3xProvider::build_source_hash("49Q27JL3HCPVNJQN", "abcdef"), + "8846c87b6e67760c42094713ec6f278a" + ); + } + + #[test] + fn parses_known_dates() { + assert!(Pornhd3xProvider::parse_uploaded_at( + "Brazzers / - Ryan Reid, Kayley Gunner, Mick Blue Surprise Dick For Their Anniversary / 22.3.2026" + ) + .is_some()); + assert!( + Pornhd3xProvider::parse_uploaded_at( + "New Dana Vespoli Kimmy Kimm A Brand Nude Incentive (22-03-2026)" + ) + .is_some() + ); + } +} diff --git a/src/providers/pornmz.rs b/src/providers/pornmz.rs new file mode 100644 index 0000000..1fa5101 --- /dev/null +++ b/src/providers/pornmz.rs @@ -0,0 +1,1241 @@ +use crate::DbPool; +use crate::api::ClientVersion; +use crate::providers::{ + Provider, report_provider_error, report_provider_error_background, requester_or_default, +}; +use 
crate::status::*; +use crate::util::cache::VideoCache; +use crate::util::parse_abbreviated_number; +use crate::util::requester::Requester; +use crate::util::time::parse_time_to_seconds; +use crate::videos::{ServerOptions, VideoFormat, VideoItem}; +use async_trait::async_trait; +use chrono::{DateTime, Utc}; +use error_chain::error_chain; +use futures::stream::{self, StreamExt}; +use htmlentity::entity::{ICodedDataTrait, decode}; +use regex::Regex; +use scraper::{ElementRef, Html, Selector}; +use std::sync::{Arc, RwLock}; +use std::{thread, vec}; +use url::Url; +use wreq::Version; + +pub const CHANNEL_METADATA: crate::providers::ProviderChannelMetadata = + crate::providers::ProviderChannelMetadata { + group_id: "mainstream-tube", + tags: &["tube", "studios", "actors"], + }; + +const BASE_URL: &str = "https://pornmz.com"; +const CHANNEL_ID: &str = "pornmz"; +const FIREFOX_UA: &str = + "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0"; +const HTML_ACCEPT: &str = + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"; + +error_chain! 
{ + foreign_links { + Io(std::io::Error); + } + errors { + Parse(msg: String) { + description("parse error") + display("parse error: {}", msg) + } + } +} + +#[derive(Debug, Clone)] +pub struct PornmzProvider { + url: String, + categories: Arc>>, + tags: Arc>>, + uploaders: Arc>>, +} + +#[derive(Debug, Clone)] +enum Target { + Home, + Search(String), + Archive(String), +} + +impl PornmzProvider { + pub fn new() -> Self { + let provider = Self { + url: BASE_URL.to_string(), + categories: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + tags: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + uploaders: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + }; + provider.spawn_initial_load(); + provider + } + + fn spawn_initial_load(&self) { + let url = self.url.clone(); + let categories = Arc::clone(&self.categories); + let tags = Arc::clone(&self.tags); + let uploaders = Arc::clone(&self.uploaders); + + thread::spawn(move || { + let runtime = match tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + { + Ok(runtime) => runtime, + Err(error) => { + report_provider_error_background( + CHANNEL_ID, + "spawn_initial_load.runtime_build", + &error.to_string(), + ); + return; + } + }; + + runtime.block_on(async move { + if let Err(error) = Self::load_categories(&url, Arc::clone(&categories)).await { + report_provider_error_background( + CHANNEL_ID, + "load_categories", + &error.to_string(), + ); + } + if let Err(error) = Self::load_tags(&url, Arc::clone(&tags)).await { + report_provider_error_background(CHANNEL_ID, "load_tags", &error.to_string()); + } + if let Err(error) = Self::load_uploaders(&url, Arc::clone(&uploaders)).await { + report_provider_error_background( + CHANNEL_ID, + "load_uploaders", + &error.to_string(), + ); + } + }); + }); + } + + fn build_channel(&self, _clientversion: 
ClientVersion) -> Channel { + let categories = self + .categories + .read() + .map(|value| value.clone()) + .unwrap_or_default(); + let tags = self.tags.read().map(|value| value.clone()).unwrap_or_default(); + let uploaders = self + .uploaders + .read() + .map(|value| value.clone()) + .unwrap_or_default(); + + Channel { + id: CHANNEL_ID.to_string(), + name: "Pornmz".to_string(), + description: + "Pornmz videos with latest, best, most-viewed, longest, random, category, tag, and actor archives." + .to_string(), + premium: false, + favicon: "https://www.google.com/s2/favicons?sz=64&domain=pornmz.com".to_string(), + status: "active".to_string(), + categories: categories.iter().map(|value| value.title.clone()).collect(), + options: vec![ + ChannelOption { + id: "sort".to_string(), + title: "Sort".to_string(), + description: "Browse Pornmz archives by ranking.".to_string(), + systemImage: "list.number".to_string(), + colorName: "blue".to_string(), + options: vec![ + FilterOption { + id: "latest".to_string(), + title: "Newest".to_string(), + }, + FilterOption { + id: "popular".to_string(), + title: "Best".to_string(), + }, + FilterOption { + id: "most-viewed".to_string(), + title: "Most Viewed".to_string(), + }, + FilterOption { + id: "longest".to_string(), + title: "Longest".to_string(), + }, + FilterOption { + id: "random".to_string(), + title: "Random".to_string(), + }, + ], + multiSelect: false, + }, + ChannelOption { + id: "categories".to_string(), + title: "Categories".to_string(), + description: "Browse a Pornmz category archive.".to_string(), + systemImage: "square.grid.2x2".to_string(), + colorName: "orange".to_string(), + options: categories, + multiSelect: false, + }, + ChannelOption { + id: "filter".to_string(), + title: "Tags".to_string(), + description: "Browse a Pornmz tag archive.".to_string(), + systemImage: "tag.fill".to_string(), + colorName: "green".to_string(), + options: tags, + multiSelect: false, + }, + ChannelOption { + id: "sites".to_string(), + 
title: "Actors".to_string(), + description: "Browse a Pornmz actor archive.".to_string(), + systemImage: "person.crop.square".to_string(), + colorName: "purple".to_string(), + options: uploaders, + multiSelect: false, + }, + ], + nsfw: true, + cacheDuration: Some(1800), + } + } + + fn selector(value: &str) -> Result { + Selector::parse(value) + .map_err(|error| Error::from(format!("selector `{value}` parse failed: {error}"))) + } + + fn regex(value: &str) -> Result { + Regex::new(value).map_err(|error| Error::from(format!("regex `{value}` failed: {error}"))) + } + + fn decode_html(text: &str) -> String { + decode(text.as_bytes()) + .to_string() + .unwrap_or_else(|_| text.to_string()) + } + + fn collapse_whitespace(text: &str) -> String { + text.split_whitespace().collect::>().join(" ") + } + + fn text_of(element: &ElementRef<'_>) -> String { + Self::decode_html(&Self::collapse_whitespace( + &element.text().collect::>().join(" "), + )) + } + + fn normalize_title(title: &str) -> String { + title + .trim() + .trim_start_matches('#') + .split_whitespace() + .collect::>() + .join(" ") + .to_ascii_lowercase() + } + + fn normalize_url(&self, url: &str) -> String { + if url.is_empty() { + return String::new(); + } + if url.starts_with("http://") || url.starts_with("https://") { + return url.to_string(); + } + if url.starts_with("//") { + return format!("https:{url}"); + } + if url.starts_with('?') { + return format!("{}{url}", self.url); + } + if url.starts_with('/') { + return format!("{}{}", self.url, url); + } + format!("{}/{}", self.url, url.trim_start_matches("./")) + } + + fn html_headers(referer: &str) -> Vec<(String, String)> { + vec![ + ("User-Agent".to_string(), FIREFOX_UA.to_string()), + ("Accept".to_string(), HTML_ACCEPT.to_string()), + ("Referer".to_string(), referer.to_string()), + ] + } + + async fn fetch_html(requester: &mut Requester, url: &str, referer: &str) -> Result { + requester + .get_with_headers(url, Self::html_headers(referer), 
Some(Version::HTTP_11)) + .await + .map_err(|error| Error::from(format!("request failed for {url}: {error}"))) + } + + fn discover_last_page(html: &str) -> u16 { + let Ok(re) = Self::regex(r#"/page/([0-9]+)"#) else { + return 1; + }; + re.captures_iter(html) + .filter_map(|caps| caps.get(1).and_then(|value| value.as_str().parse::().ok())) + .max() + .unwrap_or(1) + } + + fn push_unique(target: &Arc>>, item: FilterOption) { + if item.id.is_empty() || item.title.is_empty() { + return; + } + let normalized = Self::normalize_title(&item.title); + if normalized.is_empty() { + return; + } + + if let Ok(mut values) = target.write() { + if values.iter().any(|value| { + value.id == item.id || Self::normalize_title(&value.title) == normalized + }) { + return; + } + values.push(item); + } + } + + fn clean_filter_title(title: &str) -> String { + let mut parts = Self::decode_html(title) + .trim() + .trim_start_matches('#') + .split_whitespace() + .map(ToOwned::to_owned) + .collect::>(); + + if parts.len() > 1 + && parts + .last() + .is_some_and(|value| value.chars().all(|ch| ch.is_ascii_digit())) + { + parts.pop(); + } + + parts.join(" ").trim().to_string() + } + + fn humanize_slug(slug: &str) -> String { + slug.split('-') + .filter(|part| !part.is_empty()) + .map(|part| { + if part.chars().all(|ch| ch.is_ascii_digit()) { + return part.to_string(); + } + let mut chars = part.chars(); + match chars.next() { + Some(first) => { + let mut value = first.to_uppercase().collect::(); + value.push_str(chars.as_str()); + value + } + None => String::new(), + } + }) + .collect::>() + .join(" ") + } + + fn merge_tag(target: &mut Vec, value: String) { + let normalized = Self::normalize_title(&value); + if normalized.is_empty() { + return; + } + if target + .iter() + .any(|existing| Self::normalize_title(existing) == normalized) + { + return; + } + target.push(value); + } + + fn tags_from_card(card: &ElementRef<'_>) -> Vec { + let mut tags = Vec::new(); + for class_name in 
card.value().classes() { + let slug = class_name + .strip_prefix("tag-") + .or_else(|| class_name.strip_prefix("category-")) + .or_else(|| class_name.strip_prefix("actors-")); + let Some(slug) = slug else { + continue; + }; + if slug.chars().all(|ch| ch.is_ascii_digit()) { + continue; + } + Self::merge_tag(&mut tags, Self::humanize_slug(slug)); + } + tags + } + + async fn load_categories(base_url: &str, categories: Arc>>) -> Result<()> { + let mut requester = Requester::new(); + let first_url = format!("{base_url}/categories"); + let first_html = Self::fetch_html(&mut requester, &first_url, base_url).await?; + let max_pages = Self::discover_last_page(&first_html).max(1); + + for page in 1..=max_pages { + let url = if page == 1 { + first_url.clone() + } else { + format!("{base_url}/categories/page/{page}") + }; + let html = if page == 1 { + first_html.clone() + } else { + Self::fetch_html(&mut requester, &url, base_url).await? + }; + let document = Html::parse_document(&html); + let selector = Self::selector("a[href*=\"/pmvideo/c/\"]")?; + for element in document.select(&selector) { + let href = element.value().attr("href").unwrap_or_default(); + let title = Self::clean_filter_title(&Self::text_of(&element)); + if title.is_empty() { + continue; + } + let normalized = if href.starts_with("http") { + href.to_string() + } else { + format!( + "{base_url}/{}", + href.trim_start_matches('/').trim_end_matches('/') + ) + }; + Self::push_unique( + &categories, + FilterOption { + id: normalized, + title, + }, + ); + } + } + + Ok(()) + } + + async fn load_tags(base_url: &str, tags: Arc>>) -> Result<()> { + let mut requester = Requester::new(); + let html = Self::fetch_html(&mut requester, &format!("{base_url}/tags"), base_url).await?; + let document = Html::parse_document(&html); + let selector = Self::selector("a[href*=\"/pmvideo/s/\"]")?; + + for element in document.select(&selector) { + let href = element.value().attr("href").unwrap_or_default(); + let title = 
Self::clean_filter_title(&Self::text_of(&element)); + if title.is_empty() { + continue; + } + let normalized = if href.starts_with("http") { + href.to_string() + } else { + format!( + "{base_url}/{}", + href.trim_start_matches('/').trim_end_matches('/') + ) + }; + Self::push_unique( + &tags, + FilterOption { + id: normalized, + title, + }, + ); + } + + Ok(()) + } + + fn canonical_actor_url(base_url: &str, href: &str) -> String { + if let Ok(url) = Url::parse(href) { + if let Some((_, slug)) = url.query_pairs().find(|(key, _)| key == "actors") { + return format!("{base_url}/video/id=pmactor/{}", slug.trim()); + } + } + if let Some(slug) = href.split("actors=").nth(1) { + return format!("{base_url}/video/id=pmactor/{}", slug.trim()); + } + if href.starts_with("http://") || href.starts_with("https://") { + return href.to_string(); + } + if href.starts_with('/') { + return format!("{base_url}{href}"); + } + format!("{base_url}/{}", href.trim_start_matches("./")) + } + + async fn load_uploaders(base_url: &str, uploaders: Arc>>) -> Result<()> { + let mut requester = Requester::new(); + let first_url = format!("{base_url}/actors"); + let first_html = Self::fetch_html(&mut requester, &first_url, base_url).await?; + let max_pages = Self::discover_last_page(&first_html).max(1); + + for page in 1..=max_pages { + let url = if page == 1 { + first_url.clone() + } else { + format!("{base_url}/actors/page/{page}") + }; + let html = if page == 1 { + first_html.clone() + } else { + Self::fetch_html(&mut requester, &url, base_url).await? 
+ }; + let document = Html::parse_document(&html); + let selector = Self::selector("article.thumb-block a[href*=\"actors=\"]")?; + + for element in document.select(&selector) { + let href = element.value().attr("href").unwrap_or_default(); + let title = element + .value() + .attr("title") + .map(Self::decode_html) + .filter(|value| !value.is_empty()) + .unwrap_or_else(|| Self::text_of(&element)); + if title.is_empty() { + continue; + } + Self::push_unique( + &uploaders, + FilterOption { + id: Self::canonical_actor_url(base_url, href), + title, + }, + ); + } + } + + Ok(()) + } + + fn search_filters_need_refresh(&self) -> bool { + let categories_len = self + .categories + .read() + .map(|values| values.len()) + .unwrap_or_default(); + let tags_len = self.tags.read().map(|values| values.len()).unwrap_or_default(); + + categories_len <= 1 || tags_len <= 1 + } + + async fn refresh_search_filters(&self) { + if let Err(error) = Self::load_categories(&self.url, Arc::clone(&self.categories)).await { + report_provider_error_background( + CHANNEL_ID, + "refresh_search_filters.categories", + &error.to_string(), + ); + } + if let Err(error) = Self::load_tags(&self.url, Arc::clone(&self.tags)).await { + report_provider_error_background( + CHANNEL_ID, + "refresh_search_filters.tags", + &error.to_string(), + ); + } + } + + fn match_filter(options: &[FilterOption], query: &str) -> Option { + let normalized_query = Self::normalize_title(query); + options + .iter() + .find(|value| { + value.id != "all" && Self::normalize_title(&value.title) == normalized_query + }) + .map(|value| value.id.clone()) + } + + fn resolve_option_target(&self, options: &ServerOptions) -> Target { + if let Some(actor) = options.sites.as_deref() { + if actor.starts_with(&self.url) && actor != "all" { + return Target::Archive(actor.to_string()); + } + } + if let Some(tag) = options.filter.as_deref() { + if tag.starts_with(&self.url) && tag != "all" { + return Target::Archive(tag.to_string()); + } + } + if let 
Some(category) = options.categories.as_deref() { + if category.starts_with(&self.url) && category != "all" { + return Target::Archive(category.to_string()); + } + } + Target::Home + } + + fn resolve_query_target(&self, query: &str) -> Target { + if let Ok(uploaders) = self.uploaders.read() { + if let Some(value) = Self::match_filter(&uploaders, query) { + return Target::Archive(value); + } + } + if let Ok(tags) = self.tags.read() { + if let Some(value) = Self::match_filter(&tags, query) { + return Target::Archive(value); + } + } + if let Ok(categories) = self.categories.read() { + if let Some(value) = Self::match_filter(&categories, query) { + return Target::Archive(value); + } + } + Target::Search(query.to_string()) + } + + fn slugify_query(query: &str) -> String { + let mut slug = String::new(); + let mut last_dash = false; + for ch in query.chars().flat_map(|ch| ch.to_lowercase()) { + if ch.is_ascii_alphanumeric() { + slug.push(ch); + last_dash = false; + } else if !last_dash { + slug.push('-'); + last_dash = true; + } + } + slug.trim_matches('-').to_string() + } + + async fn guess_actor_archive( + &self, + query: &str, + options: &ServerOptions, + ) -> Option { + let slug = Self::slugify_query(query); + if slug.is_empty() { + return None; + } + let archive_url = format!("{}/video/id=pmactor/{}", self.url, slug); + let mut requester = + requester_or_default(options, CHANNEL_ID, "pornmz.guess_actor_archive.missing_requester"); + let html = Self::fetch_html(&mut requester, &archive_url, &archive_url) + .await + .ok()?; + let document = Html::parse_document(&html); + let page_title = document + .select(&Self::selector("title").ok()?) 
+ .next() + .map(|value| Self::text_of(&value)) + .unwrap_or_default(); + let normalized_query = Self::normalize_title(query); + if !Self::normalize_title(&page_title).contains(&normalized_query) { + return None; + } + let items = self.parse_list_videos(&html).ok()?; + (!items.is_empty()).then_some(archive_url) + } + + fn sort_filter(sort: &str) -> &'static str { + match sort { + "popular" | "best" => "popular", + "most-viewed" | "viewed" | "trending" => "most-viewed", + "longest" | "duration" => "longest", + "random" => "random", + _ => "latest", + } + } + + fn add_sort(base: &str, sort: &str) -> String { + let mut url = Url::parse(base).unwrap_or_else(|_| Url::parse(BASE_URL).unwrap()); + url.query_pairs_mut() + .append_pair("filter", Self::sort_filter(sort)); + url.to_string() + } + + fn build_target_url(&self, target: &Target, page: u16, sort: &str) -> String { + let base = match target { + Target::Home => format!("{}/", self.url), + Target::Search(query) => { + let mut url = Url::parse(&format!("{}/", self.url)).unwrap(); + url.query_pairs_mut().append_pair("s", query); + url.to_string() + } + Target::Archive(url) => url.clone(), + }; + + let with_sort = Self::add_sort(&base, sort); + let mut parsed = Url::parse(&with_sort).unwrap_or_else(|_| Url::parse(&base).unwrap()); + if page > 1 { + let path = parsed.path().trim_end_matches('/'); + let new_path = if path.is_empty() { + format!("/page/{page}") + } else { + format!("{path}/page/{page}") + }; + parsed.set_path(&new_path); + } + parsed.to_string() + } + + fn parse_duration(text: &str) -> u32 { + parse_time_to_seconds(text) + .and_then(|value| u32::try_from(value).ok()) + .unwrap_or(0) + } + + fn parse_views(text: &str) -> Option { + parse_abbreviated_number(text.trim()) + } + + fn parse_iso8601_duration(text: &str) -> Option { + let re = Self::regex(r#"P(?:\d+D)?T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?"#).ok()?; + let caps = re.captures(text)?; + let hours = caps + .get(1) + .and_then(|value| 
value.as_str().parse::().ok()) + .unwrap_or(0); + let minutes = caps + .get(2) + .and_then(|value| value.as_str().parse::().ok()) + .unwrap_or(0); + let seconds = caps + .get(3) + .and_then(|value| value.as_str().parse::().ok()) + .unwrap_or(0); + Some(hours * 3600 + minutes * 60 + seconds) + } + + fn parse_uploaded_at(text: &str) -> Option { + DateTime::parse_from_rfc3339(text) + .ok() + .map(|value| value.with_timezone(&Utc).timestamp() as u64) + } + + fn media_extension(url: &str) -> String { + Url::parse(url) + .ok() + .and_then(|value| { + value + .path_segments() + .and_then(|mut segments| segments.next_back().map(str::to_string)) + }) + .and_then(|segment| segment.rsplit('.').next().map(str::to_string)) + .filter(|ext| !ext.is_empty() && !ext.contains('/')) + .unwrap_or_else(|| "auto".to_string()) + } + + fn list_container<'a>(&self, document: &'a Html) -> Result>> { + for selector_text in [ + "div.videos-list", + "div.posts", + "main.site-main", + "div.content-area", + ] { + let selector = Self::selector(selector_text)?; + if let Some(element) = document.select(&selector).next() { + return Ok(Some(element)); + } + } + Ok(None) + } + + fn parse_list_videos(&self, html: &str) -> Result> { + let document = Html::parse_document(html); + let Some(container) = self.list_container(&document)? 
else { + return Ok(vec![]); + }; + + let card_selector = Self::selector("article.thumb-block")?; + let link_selector = Self::selector("a[href*=\"/video/id=\"]")?; + let thumb_selector = Self::selector("img.video-main-thumb")?; + let title_selector = Self::selector("span.title")?; + let duration_selector = Self::selector("span.duration")?; + let views_selector = Self::selector("span.views")?; + + let mut items = Vec::new(); + for card in container.select(&card_selector) { + let Some(link) = card.select(&link_selector).next() else { + continue; + }; + let detail_url = self.normalize_url(link.value().attr("href").unwrap_or_default()); + if detail_url.is_empty() { + continue; + } + let id = detail_url + .rsplit("id=") + .next() + .unwrap_or_default() + .trim() + .to_string(); + if id.is_empty() || id.starts_with("pmactor/") { + continue; + } + + let title = card + .select(&title_selector) + .next() + .map(|value| Self::text_of(&value)) + .filter(|value| !value.is_empty()) + .or_else(|| { + link.value() + .attr("title") + .map(Self::decode_html) + .filter(|value| !value.is_empty()) + }); + let Some(title) = title else { + continue; + }; + + let thumb = card + .select(&thumb_selector) + .next() + .and_then(|value| value.value().attr("src")) + .map(|value| self.normalize_url(value)) + .unwrap_or_else(|| { + self.normalize_url(card.value().attr("data-main-thumb").unwrap_or_default()) + }); + + let preview = self.normalize_url(card.value().attr("data-trailer").unwrap_or_default()); + let duration = card + .select(&duration_selector) + .next() + .map(|value| Self::parse_duration(&Self::text_of(&value))) + .unwrap_or(0); + let views = card + .select(&views_selector) + .next() + .map(|value| Self::text_of(&value)) + .and_then(|value| Self::parse_views(&value)); + + let mut item = + VideoItem::new(id, title, detail_url, CHANNEL_ID.to_string(), thumb, duration); + if let Some(views) = views { + item.views = Some(views); + } + if !preview.is_empty() { + item.preview = 
Some(preview); + } + let class_tags = Self::tags_from_card(&card); + if !class_tags.is_empty() { + item.tags = Some(class_tags); + } + items.push(item); + } + + Ok(items) + } + + fn meta_itemprop(document: &Html, itemprop: &str) -> Option { + let selector = Self::selector(&format!("meta[itemprop=\"{itemprop}\"]")).ok()?; + document + .select(&selector) + .next() + .and_then(|value| value.value().attr("content")) + .map(Self::decode_html) + } + + fn label_entries(document: &Html) -> Result> { + let selector = Self::selector("div.video-tags a.label[href]")?; + let icon_selector = Self::selector("i")?; + let mut values = Vec::new(); + + for element in document.select(&selector) { + let href = element.value().attr("href").unwrap_or_default().to_string(); + let title = Self::text_of(&element); + let kind = element + .select(&icon_selector) + .next() + .and_then(|value| value.value().attr("class")) + .unwrap_or_default() + .to_string(); + if !href.is_empty() && !title.is_empty() { + values.push((href, title, kind)); + } + } + + Ok(values) + } + + fn apply_detail_video(&self, mut item: VideoItem, html: &str) -> Result { + let document = Html::parse_document(html); + + if let Some(title) = document + .select(&Self::selector("div.video-infos h1, h1[itemprop=\"name\"]")?) 
+ .next() + .map(|value| Self::text_of(&value)) + .filter(|value| !value.is_empty()) + .or_else(|| Self::meta_itemprop(&document, "name").filter(|value| !value.is_empty())) + { + item.title = title; + } + + if let Some(url) = Self::meta_itemprop(&document, "contentURL").filter(|value| !value.is_empty()) { + item.url = self.normalize_url(&url); + let ext = Self::media_extension(&item.url); + item.formats = Some(vec![VideoFormat::new( + item.url.clone(), + "auto".to_string(), + ext, + )]); + } + + if let Some(thumb) = Self::meta_itemprop(&document, "thumbnailUrl").filter(|value| !value.is_empty()) { + item.thumb = self.normalize_url(&thumb); + } + + if let Some(duration) = Self::meta_itemprop(&document, "duration") + .and_then(|value| Self::parse_iso8601_duration(&value)) + { + item.duration = duration; + } + + if let Some(uploaded_at) = Self::meta_itemprop(&document, "uploadDate") + .and_then(|value| Self::parse_uploaded_at(&value)) + { + item.uploadedAt = Some(uploaded_at); + } + + let views_selector = Self::selector("div.video-infos span.views")?; + if let Some(views) = document + .select(&views_selector) + .next() + .map(|value| Self::text_of(&value)) + .and_then(|value| Self::parse_views(&value)) + { + item.views = Some(views); + } + + let mut tags = item.tags.take().unwrap_or_default(); + let mut uploader = None; + let mut uploader_url = None; + for (href, title, kind) in Self::label_entries(&document)? 
{ + let normalized_title = Self::normalize_title(&title); + if normalized_title.is_empty() { + continue; + } + Self::merge_tag(&mut tags, title.clone()); + if uploader.is_none() && kind.contains("fa-star") { + uploader = Some(title); + uploader_url = Some(self.normalize_url(&href)); + } + } + + if let Some(uploader) = uploader { + item.uploader = Some(uploader); + } + if let Some(uploader_url) = uploader_url.filter(|value| !value.is_empty()) { + item.uploaderUrl = Some(uploader_url); + } + if !tags.is_empty() { + item.tags = Some(tags); + } + + Ok(item) + } + + async fn enrich_item(&self, item: VideoItem, options: &ServerOptions) -> VideoItem { + let mut requester = + requester_or_default(options, CHANNEL_ID, "pornmz.enrich_item.missing_requester"); + match Self::fetch_html(&mut requester, &item.url, &item.url).await { + Ok(html) => match self.apply_detail_video(item.clone(), &html) { + Ok(value) => value, + Err(error) => { + report_provider_error_background( + CHANNEL_ID, + "enrich_item.apply_detail_video", + &format!("url={}; error={error}", item.url), + ); + item + } + }, + Err(error) => { + report_provider_error_background( + CHANNEL_ID, + "enrich_item.fetch_html", + &format!("url={}; error={error}", item.url), + ); + item + } + } + } + + async fn fetch_target( + &self, + cache: VideoCache, + target: Target, + page: u16, + sort: &str, + options: ServerOptions, + ) -> Result> { + let url = self.build_target_url(&target, page, sort); + let old_items = match cache.get(&url) { + Some((time, items)) => { + if time.elapsed().unwrap_or_default().as_secs() < 60 * 5 { + return Ok(items.clone()); + } + items.clone() + } + None => vec![], + }; + + let mut requester = + requester_or_default(&options, CHANNEL_ID, "pornmz.fetch_target.missing_requester"); + let html = match Self::fetch_html(&mut requester, &url, BASE_URL).await { + Ok(value) => value, + Err(error) => { + report_provider_error( + CHANNEL_ID, + "fetch_target.request", + &format!("url={url}; error={error}"), + 
) + .await; + return Ok(old_items); + } + }; + + if html.trim().is_empty() { + report_provider_error(CHANNEL_ID, "fetch_target.empty_response", &format!("url={url}")) + .await; + return Ok(old_items); + } + + let items = self.parse_list_videos(&html)?; + if items.is_empty() { + return Ok(old_items); + } + + let enriched = stream::iter(items.into_iter().map(|item| { + let provider = self.clone(); + let options = options.clone(); + async move { provider.enrich_item(item, &options).await } + })) + .buffer_unordered(6) + .collect::>() + .await; + + cache.remove(&url); + cache.insert(url, enriched.clone()); + Ok(enriched) + } +} + +#[async_trait] +impl Provider for PornmzProvider { + async fn get_videos( + &self, + cache: VideoCache, + _pool: DbPool, + sort: String, + query: Option, + page: String, + _per_page: String, + options: ServerOptions, + ) -> Vec { + let page = page.parse::().unwrap_or(1).max(1); + + let target = match query { + Some(query) if !query.trim().is_empty() => { + let query = query.trim(); + let mut target = self.resolve_query_target(query); + if matches!(target, Target::Search(_)) { + if let Some(archive) = self.guess_actor_archive(query, &options).await { + target = Target::Archive(archive); + } + } + if matches!(target, Target::Search(_)) && self.search_filters_need_refresh() { + self.refresh_search_filters().await; + target = self.resolve_query_target(query); + if matches!(target, Target::Search(_)) { + if let Some(archive) = self.guess_actor_archive(query, &options).await { + target = Target::Archive(archive); + } + } + } + target + } + _ => self.resolve_option_target(&options), + }; + + match self + .fetch_target(cache, target, page, &sort, options.clone()) + .await + { + Ok(items) => items, + Err(error) => { + report_provider_error( + CHANNEL_ID, + "get_videos.fetch_target", + &format!("sort={sort}; page={page}; error={error}"), + ) + .await; + vec![] + } + } + } + + fn get_channel(&self, clientversion: ClientVersion) -> Option { + 
Some(self.build_channel(clientversion)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn test_provider() -> PornmzProvider { + PornmzProvider { + url: BASE_URL.to_string(), + categories: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + tags: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + uploaders: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + } + } + + #[test] + fn search_filters_ignore_missing_uploaders() { + let provider = PornmzProvider { + url: BASE_URL.to_string(), + categories: Arc::new(RwLock::new(vec![ + FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }, + FilterOption { + id: "https://pornmz.com/pmvideo/c/brazzers".to_string(), + title: "Brazzers".to_string(), + }, + ])), + tags: Arc::new(RwLock::new(vec![ + FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }, + FilterOption { + id: "https://pornmz.com/pmvideo/s/blonde".to_string(), + title: "Blonde".to_string(), + }, + ])), + uploaders: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + }; + + assert!(!provider.search_filters_need_refresh()); + } + + #[test] + fn builds_search_page_url() { + let provider = test_provider(); + let url = provider.build_target_url(&Target::Search("brazzers".to_string()), 2, "most-viewed"); + assert_eq!(url, "https://pornmz.com/page/2?s=brazzers&filter=most-viewed"); + } + + #[test] + fn canonicalizes_actor_urls() { + assert_eq!( + PornmzProvider::canonical_actor_url( + BASE_URL, + "https://pornmz.com?actors=kayley-gunner" + ), + "https://pornmz.com/video/id=pmactor/kayley-gunner" + ); + } + + #[test] + fn parses_list_card() { + let provider = test_provider(); + let html = r#" + + "#; + let items = provider.parse_list_videos(html).expect("items"); + assert_eq!(items.len(), 1); + assert_eq!(items[0].id, 
"pm123"); + assert_eq!(items[0].duration, 1740); + assert_eq!(items[0].views, Some(1200)); + assert_eq!(items[0].preview.as_deref(), Some("https://pornmz.com/preview.mp4")); + assert!(items[0].tags.as_ref().is_some_and(|values| values.iter().any(|value| value == "Blonde"))); + } + + #[test] + fn applies_detail_media_and_labels() { + let provider = test_provider(); + let item = VideoItem::new( + "pm123".to_string(), + "Old title".to_string(), + "https://pornmz.com/video/id=pm123".to_string(), + CHANNEL_ID.to_string(), + "https://pornmz.com/thumb.jpg".to_string(), + 0, + ); + let html = r#" +
+ + + + + +
1.4K
+ +
+ "#; + let item = provider.apply_detail_video(item, html).expect("detail"); + assert_eq!(item.title, "Real Title"); + assert_eq!(item.url, "https://cdn.example/master.m3u8"); + assert_eq!(item.formats.as_ref().and_then(|values| values.first()).map(|value| value.format.clone()).as_deref(), Some("m3u8")); + assert_eq!(item.duration, 1740); + assert_eq!(item.views, Some(1400)); + assert_eq!(item.uploader.as_deref(), Some("Kayley Gunner")); + assert_eq!( + item.uploaderUrl.as_deref(), + Some("https://pornmz.com/video/id=pmactor/kayley-gunner") + ); + assert!(item.tags.as_ref().is_some_and(|values| values.iter().any(|value| value == "Brazzers"))); + assert!(item.formats.is_some()); + assert!(item.uploadedAt.is_some()); + } + + #[test] + fn derives_mp4_media_extension() { + assert_eq!( + PornmzProvider::media_extension("https://cdn.example/video.mp4?token=1"), + "mp4" + ); + } +} diff --git a/src/providers/sextb.rs b/src/providers/sextb.rs new file mode 100644 index 0000000..a1daf4b --- /dev/null +++ b/src/providers/sextb.rs @@ -0,0 +1,1210 @@ +use crate::DbPool; +use crate::api::ClientVersion; +use crate::providers::{ + Provider, report_provider_error, report_provider_error_background, requester_or_default, +}; +use crate::status::*; +use crate::util::cache::VideoCache; +use crate::util::parse_abbreviated_number; +use crate::util::requester::Requester; +use crate::videos::{ServerOptions, VideoItem}; +use async_trait::async_trait; +use chrono::{DateTime, Duration as ChronoDuration, NaiveDate, Utc}; +use error_chain::error_chain; +use futures::stream::{self, StreamExt}; +use regex::Regex; +use std::sync::{Arc, RwLock}; +use std::time::Duration as StdDuration; +use std::{thread, vec}; +use tokio::time::timeout; +use url::Url; +use wreq::Version; + +pub const CHANNEL_METADATA: crate::providers::ProviderChannelMetadata = + crate::providers::ProviderChannelMetadata { + group_id: "jav", + tags: &["jav", "asian", "metadata"], + }; + +error_chain! 
{ + foreign_links { + Io(std::io::Error); + } + errors { + Parse(msg: String) { + description("parse error") + display("parse error: {}", msg) + } + } +} + +const BASE_URL: &str = "https://sextb.net"; +const MIRROR_PREFIX: &str = "https://r.jina.ai/http://"; +const CHANNEL_ID: &str = "sextb"; + +#[derive(Debug, Clone)] +pub struct SextbProvider { + url: String, + genres: Arc>>, + studios: Arc>>, + actresses: Arc>>, +} + +#[derive(Debug, Clone)] +enum Target { + Section { + section: String, + sort: String, + }, + Search { + slug: String, + }, + Genre { + slug: String, + }, + Studio { + slug: String, + }, + Actress { + slug: String, + }, +} + +#[derive(Debug, Clone, Default)] +struct ListItemSeed { + id: String, + title: String, + url: String, + thumb: String, + duration: u32, + quality: Option, + code: Option, + tags: Vec, +} + +#[derive(Debug, Clone, Default)] +struct DetailMetadata { + aliases: Vec, + director: Option, + label: Option, + studio: Option<(String, String)>, + cast: Vec<(String, String)>, + genres: Vec<(String, String)>, + quality: Option, + release_date: Option, + added_at: Option, + views: Option, + rating: Option, +} + +impl SextbProvider { + pub fn new() -> Self { + let provider = Self { + url: BASE_URL.to_string(), + genres: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + studios: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + actresses: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + }; + provider.spawn_initial_load(); + provider + } + + fn spawn_initial_load(&self) { + let url = self.url.clone(); + let genres = Arc::clone(&self.genres); + let studios = Arc::clone(&self.studios); + let actresses = Arc::clone(&self.actresses); + + thread::spawn(move || { + let runtime = match tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + { + Ok(runtime) => runtime, + 
Err(error) => { + report_provider_error_background( + CHANNEL_ID, + "spawn_initial_load.runtime_build", + &error.to_string(), + ); + return; + } + }; + + runtime.block_on(async move { + if let Err(error) = Self::load_genres(&url, Arc::clone(&genres)).await { + report_provider_error_background(CHANNEL_ID, "load_genres", &error.to_string()); + } + if let Err(error) = Self::load_studios(&url, Arc::clone(&studios)).await { + report_provider_error_background( + CHANNEL_ID, + "load_studios", + &error.to_string(), + ); + } + if let Err(error) = Self::load_actresses(&url, Arc::clone(&actresses)).await { + report_provider_error_background( + CHANNEL_ID, + "load_actresses", + &error.to_string(), + ); + } + }); + }); + } + + fn build_channel(&self, _clientversion: ClientVersion) -> Channel { + let genres = self.genres.read().map(|value| value.clone()).unwrap_or_default(); + let studios = self.studios.read().map(|value| value.clone()).unwrap_or_default(); + let actresses = self + .actresses + .read() + .map(|value| value.clone()) + .unwrap_or_default(); + + Channel { + id: CHANNEL_ID.to_string(), + name: "SEXTB".to_string(), + description: + "JAV archive pages with section, genre, studio, actress, and rich detail metadata." 
+ .to_string(), + premium: false, + favicon: "https://www.google.com/s2/favicons?sz=64&domain=sextb.net".to_string(), + status: "active".to_string(), + categories: genres.iter().map(|value| value.title.clone()).collect(), + options: vec![ + ChannelOption { + id: "sort".to_string(), + title: "Sort".to_string(), + description: "Sort a section archive page.".to_string(), + systemImage: "list.number".to_string(), + colorName: "blue".to_string(), + options: vec![ + FilterOption { + id: "latest".to_string(), + title: "Latest".to_string(), + }, + FilterOption { + id: "release".to_string(), + title: "New Releases".to_string(), + }, + FilterOption { + id: "liked".to_string(), + title: "Most Liked".to_string(), + }, + FilterOption { + id: "viewed".to_string(), + title: "Most Viewed".to_string(), + }, + FilterOption { + id: "favorite".to_string(), + title: "Most Favorite".to_string(), + }, + ], + multiSelect: false, + }, + ChannelOption { + id: "filter".to_string(), + title: "Section".to_string(), + description: "Browse one of the main site sections.".to_string(), + systemImage: "square.grid.2x2".to_string(), + colorName: "green".to_string(), + options: vec![ + FilterOption { + id: "censored".to_string(), + title: "Censored".to_string(), + }, + FilterOption { + id: "uncensored".to_string(), + title: "Uncensored".to_string(), + }, + FilterOption { + id: "subtitle".to_string(), + title: "Subtitle".to_string(), + }, + FilterOption { + id: "amateur".to_string(), + title: "Amateur".to_string(), + }, + ], + multiSelect: false, + }, + ChannelOption { + id: "categories".to_string(), + title: "Genres".to_string(), + description: "Browse direct genre archive pages.".to_string(), + systemImage: "tag.fill".to_string(), + colorName: "orange".to_string(), + options: genres, + multiSelect: false, + }, + ChannelOption { + id: "sites".to_string(), + title: "Studios".to_string(), + description: "Browse direct studio archive pages.".to_string(), + systemImage: "building.2.fill".to_string(), + 
colorName: "purple".to_string(), + options: studios, + multiSelect: false, + }, + ChannelOption { + id: "stars".to_string(), + title: "Actresses".to_string(), + description: "Browse direct actress archive pages.".to_string(), + systemImage: "person.crop.square".to_string(), + colorName: "pink".to_string(), + options: actresses, + multiSelect: false, + }, + ], + nsfw: true, + cacheDuration: Some(1800), + } + } + + fn regex(value: &str) -> Result { + Regex::new(value).map_err(|error| Error::from(format!("regex `{value}` failed: {error}"))) + } + + fn collapse_whitespace(text: &str) -> String { + text.split_whitespace().collect::>().join(" ") + } + + fn normalize_title(text: &str) -> String { + Self::collapse_whitespace(text) + .trim_matches('"') + .trim() + .to_string() + } + + fn mirror_url(&self, url: &str) -> String { + format!("{MIRROR_PREFIX}{url}") + } + + fn slugify(value: &str) -> String { + let mut output = String::new(); + let mut needs_dash = false; + + for ch in value.chars() { + if ch.is_ascii_alphanumeric() { + if needs_dash && !output.is_empty() { + output.push('-'); + } + output.push(ch.to_ascii_lowercase()); + needs_dash = false; + continue; + } + + if matches!(ch, ' ' | '-' | '_' | '/' | '&' | '.' 
| '+' | '\'') { + needs_dash = !output.is_empty(); + } + } + + output.trim_matches('-').to_string() + } + + fn parse_abbrev(text: &str) -> Option { + let cleaned = text.replace(',', "").trim().to_string(); + parse_abbreviated_number(&cleaned) + } + + fn parse_duration(text: &str) -> u32 { + text.trim().parse::().unwrap_or(0) * 60 + } + + fn parse_relative_or_date(text: &str) -> Option { + let trimmed = text.trim(); + if trimmed.is_empty() { + return None; + } + + if let Some(value) = Self::parse_absolute_date(trimmed) { + return Some(value); + } + + let regex = Regex::new( + r"(?i)^(?P\d+)\s+(?Pminute|minutes|hour|hours|day|days|week|weeks|month|months|year|years)\s+ago$", + ) + .ok()?; + let captures = regex.captures(trimmed)?; + let amount = captures.name("amount")?.as_str().parse::().ok()?; + let unit = captures.name("unit")?.as_str().to_ascii_lowercase(); + let now = Utc::now(); + let timestamp = match unit.as_str() { + "minute" | "minutes" => now - ChronoDuration::minutes(amount), + "hour" | "hours" => now - ChronoDuration::hours(amount), + "day" | "days" => now - ChronoDuration::days(amount), + "week" | "weeks" => now - ChronoDuration::weeks(amount), + "month" | "months" => now - ChronoDuration::days(amount * 30), + "year" | "years" => now - ChronoDuration::days(amount * 365), + _ => return None, + }; + Some(timestamp.timestamp() as u64) + } + + fn parse_absolute_date(text: &str) -> Option { + let normalized = text.replace('.', ""); + let date = NaiveDate::parse_from_str(&normalized, "%b %d, %Y").ok()?; + let dt = date.and_hms_opt(0, 0, 0)?; + Some(DateTime::::from_naive_utc_and_offset(dt, Utc).timestamp() as u64) + } + + fn push_unique(target: &Arc>>, item: FilterOption) { + if item.id.is_empty() || item.title.is_empty() { + return; + } + if let Ok(mut values) = target.write() { + if !values.iter().any(|existing| { + existing.id.eq_ignore_ascii_case(&item.id) + || existing.title.eq_ignore_ascii_case(&item.title) + }) { + values.push(item); + } + } + } + + 
async fn fetch_markdown(&self, requester: &mut Requester, url: &str) -> Result { + let mirror_url = self.mirror_url(url); + requester + .get(&mirror_url, Some(Version::HTTP_11)) + .await + .map_err(|error| Error::from(format!("request failed for {url}: {error}"))) + } + + fn pagination_last_page(markdown: &str) -> Option { + let regex = Regex::new(r"\[(?P\d+)\]\(https://sextb\.net/[^)]*pg-(?P=page)\)") + .ok()?; + regex + .captures_iter(markdown) + .filter_map(|captures| captures.name("page")?.as_str().parse::().ok()) + .max() + } + + fn parse_genre_options(markdown: &str) -> Result> { + let regex = Self::regex( + r#"(?m)^\*\s+\[(?P[^\]]+)\]\(https://sextb\.net/genre/(?P<slug>[^)\s]+)[^)]*\)\((?P<count>\d+)\)$"#, + )?; + let mut items = Vec::new(); + for captures in regex.captures_iter(markdown) { + let title = Self::normalize_title(captures.name("title").map(|value| value.as_str()).unwrap_or_default()); + let slug = captures + .name("slug") + .map(|value| value.as_str()) + .unwrap_or_default() + .to_string(); + if title.is_empty() || slug.is_empty() { + continue; + } + items.push(FilterOption { id: slug, title }); + } + Ok(items) + } + + fn parse_studio_options(markdown: &str) -> Result<Vec<FilterOption>> { + let regex = Self::regex( + r#"(?m)^\[(?P<title>[^\]]+)\]\(https://sextb\.net/studio/(?P<slug>[^)\s]+)[^)]*\)"#, + )?; + let mut items = Vec::new(); + for captures in regex.captures_iter(markdown) { + let title = + Self::normalize_title(captures.name("title").map(|value| value.as_str()).unwrap_or_default()); + let slug = captures + .name("slug") + .map(|value| value.as_str()) + .unwrap_or_default() + .to_string(); + if title.is_empty() || slug.is_empty() { + continue; + } + items.push(FilterOption { id: slug, title }); + } + Ok(items) + } + + fn parse_actress_options(markdown: &str) -> Result<Vec<FilterOption>> { + let regex = Self::regex( + r#"(?m)^\[\!\[Image \d+: 
(?P<title>[^\]]+)\]\(https://[^)]+\)\s+(?P=title)\s+[0-9.]+[KM]\s+\d{2}/\d{2}/\d{4}\]\(https://sextb\.net/actress/[^)\s]+\)$"#, + )?; + let mut items = Vec::new(); + for captures in regex.captures_iter(markdown) { + let title = + Self::normalize_title(captures.name("title").map(|value| value.as_str()).unwrap_or_default()); + if title.is_empty() { + continue; + } + let slug_source = title.split('(').next().unwrap_or(&title).trim(); + let slug = Self::slugify(slug_source); + if slug.is_empty() { + continue; + } + items.push(FilterOption { id: slug, title: slug_source.to_string() }); + } + Ok(items) + } + + async fn load_genres(base_url: &str, genres: Arc<RwLock<Vec<FilterOption>>>) -> Result<()> { + let provider = Self { + url: base_url.to_string(), + genres: Arc::clone(&genres), + studios: Arc::new(RwLock::new(vec![])), + actresses: Arc::new(RwLock::new(vec![])), + }; + let mut requester = Requester::new(); + let markdown = provider.fetch_markdown(&mut requester, &format!("{base_url}/genres")).await?; + for item in Self::parse_genre_options(&markdown)? { + Self::push_unique(&genres, item); + } + Ok(()) + } + + async fn load_studios(base_url: &str, studios: Arc<RwLock<Vec<FilterOption>>>) -> Result<()> { + let provider = Self { + url: base_url.to_string(), + genres: Arc::new(RwLock::new(vec![])), + studios: Arc::clone(&studios), + actresses: Arc::new(RwLock::new(vec![])), + }; + let mut requester = Requester::new(); + let markdown = + provider.fetch_markdown(&mut requester, &format!("{base_url}/list-studios")).await?; + for item in Self::parse_studio_options(&markdown)? 
{ + Self::push_unique(&studios, item); + } + Ok(()) + } + + async fn load_actresses( + base_url: &str, + actresses: Arc<RwLock<Vec<FilterOption>>>, + ) -> Result<()> { + let provider = Self { + url: base_url.to_string(), + genres: Arc::new(RwLock::new(vec![])), + studios: Arc::new(RwLock::new(vec![])), + actresses: Arc::clone(&actresses), + }; + let mut requester = Requester::new(); + let first_page = + provider.fetch_markdown(&mut requester, &format!("{base_url}/list-actress")).await?; + for item in Self::parse_actress_options(&first_page)? { + Self::push_unique(&actresses, item); + } + + let last_page = Self::pagination_last_page(&first_page).unwrap_or(1).min(80); + for page in 2..=last_page { + let markdown = provider + .fetch_markdown(&mut requester, &format!("{base_url}/list-actress/pg-{page}")) + .await?; + for item in Self::parse_actress_options(&markdown)? { + Self::push_unique(&actresses, item); + } + } + Ok(()) + } + + fn resolve_section(options: &ServerOptions) -> String { + match options.filter.as_deref().unwrap_or("censored") { + "uncensored" => "uncensored".to_string(), + "subtitle" => "subtitle".to_string(), + "amateur" => "amateur".to_string(), + _ => "censored".to_string(), + } + } + + fn resolve_sort(sort: &str) -> String { + match sort { + "release" => "release".to_string(), + "liked" => "liked".to_string(), + "viewed" | "popular" => "viewed".to_string(), + "favorite" => "favorite".to_string(), + _ => "latest".to_string(), + } + } + + fn find_option_slug(options: &Arc<RwLock<Vec<FilterOption>>>, value: &str) -> Option<String> { + let normalized = value.trim().to_ascii_lowercase(); + let slug = Self::slugify(&normalized); + let options = options.read().ok()?; + let option = options.iter().find(|item| { + item.id.eq_ignore_ascii_case(value) + || item.title.eq_ignore_ascii_case(value) + || Self::slugify(&item.title) == slug + })?; + Some(option.id.clone()) + } + + fn resolve_option_target(&self, options: &ServerOptions, sort: &str) -> Target { + if 
let Some(value) = options.stars.as_deref() { + if let Some(slug) = Self::find_option_slug(&self.actresses, value) { + return Target::Actress { slug }; + } + } + if let Some(value) = options.sites.as_deref() { + if let Some(slug) = Self::find_option_slug(&self.studios, value) { + return Target::Studio { slug }; + } + } + if let Some(value) = options.categories.as_deref() { + if let Some(slug) = Self::find_option_slug(&self.genres, value) { + return Target::Genre { slug }; + } + } + Target::Section { + section: Self::resolve_section(options), + sort: Self::resolve_sort(sort), + } + } + + fn resolve_query_target(&self, query: &str, sort: &str) -> Target { + if let Some(slug) = Self::find_option_slug(&self.actresses, query) { + return Target::Actress { slug }; + } + if let Some(slug) = Self::find_option_slug(&self.studios, query) { + return Target::Studio { slug }; + } + if let Some(slug) = Self::find_option_slug(&self.genres, query) { + return Target::Genre { slug }; + } + + let slug = Self::slugify(query); + if slug.is_empty() { + return Target::Section { + section: "censored".to_string(), + sort: Self::resolve_sort(sort), + }; + } + Target::Search { slug } + } + + fn build_url_for_target(&self, target: &Target, page: u16) -> String { + match target { + Target::Section { section, sort } => { + let mut url = Url::parse(&format!("{}/{}", self.url, section)) + .expect("valid section base url"); + { + let mut pairs = url.query_pairs_mut(); + pairs.append_pair("genre", "all"); + pairs.append_pair("studio", "all"); + pairs.append_pair("quality", "all"); + pairs.append_pair("year", "all"); + if sort != "latest" { + pairs.append_pair("sort", sort); + } + if page > 1 { + pairs.append_pair("pg", &page.to_string()); + } + } + url.to_string() + } + Target::Search { slug } => { + if page > 1 { + format!("{}/search/{}/pg-{}", self.url, slug, page) + } else { + format!("{}/search/{}", self.url, slug) + } + } + Target::Genre { slug } => { + if page > 1 { + 
format!("{}/genre/{}/pg-{}", self.url, slug, page) + } else { + format!("{}/genre/{}", self.url, slug) + } + } + Target::Studio { slug } => { + if page > 1 { + format!("{}/studio/{}/pg-{}", self.url, slug, page) + } else { + format!("{}/studio/{}", self.url, slug) + } + } + Target::Actress { slug } => { + if page > 1 { + format!("{}/actress/{}/pg-{}", self.url, slug, page) + } else { + format!("{}/actress/{}", self.url, slug) + } + } + } + } + + fn parse_list_prefix_tags(prefix: &str) -> Vec<String> { + let cleaned = Self::collapse_whitespace(prefix); + if cleaned.is_empty() { + return vec![]; + } + + let bracket_regex = Regex::new(r"\[(?P<tag>[^\]]+)\]").ok(); + let mut tags = Vec::new(); + if let Some(regex) = bracket_regex { + for captures in regex.captures_iter(&cleaned) { + if let Some(tag) = captures.name("tag").map(|value| value.as_str().trim()) { + if !tag.is_empty() { + tags.push(tag.to_string()); + } + } + } + } + + let without_brackets = Regex::new(r"\[[^\]]+\]") + .ok() + .map(|regex| regex.replace_all(&cleaned, " ").to_string()) + .unwrap_or(cleaned.clone()); + for token in without_brackets.split(" ") { + let value = Self::collapse_whitespace(token); + if !value.is_empty() { + tags.push(value); + } + } + + tags.sort(); + tags.dedup(); + tags + } + + fn parse_list_items(markdown: &str) -> Result<Vec<ListItemSeed>> { + let regex = Self::regex( + r#"(?s)\[\!\[Image \d+: (?P<alt>.*?)\]\((?P<thumb>https?://[^)]+)\)(?P<body>.*?)\]\((?P<url>https://sextb\.net/[^)]+)\)"#, + )?; + let meta_regex = Self::regex( + r#"(?s)^(?P<head>.*?)\s+(?P<quality>HD|SD)\s+(?P<duration>\d+)\s+min\s+(?P<code>[A-Za-z0-9\-]+)$"#, + )?; + let mut items = Vec::new(); + + for captures in regex.captures_iter(markdown) { + let url = captures + .name("url") + .map(|value| value.as_str()) + .unwrap_or_default() + .to_string(); + if !url.starts_with("https://sextb.net/") { + continue; + } + + let body = Self::collapse_whitespace( + captures.name("body").map(|value| 
value.as_str()).unwrap_or_default(), + ); + let meta = match meta_regex.captures(&body) { + Some(meta) => meta, + None => continue, + }; + + let alt_title = Self::normalize_title( + captures.name("alt").map(|value| value.as_str()).unwrap_or_default(), + ); + let head = Self::normalize_title(meta.name("head").map(|value| value.as_str()).unwrap_or_default()); + let title = if !alt_title.is_empty() && head.contains(&alt_title) { + alt_title.clone() + } else if !alt_title.is_empty() && !alt_title.eq_ignore_ascii_case("Julia") { + alt_title.clone() + } else { + head.clone() + }; + if title.is_empty() { + continue; + } + + let prefix = if !alt_title.is_empty() && head.contains(&alt_title) { + let prefix = head.split(&alt_title).next().unwrap_or_default(); + prefix.to_string() + } else { + String::new() + }; + + let slug = url + .trim_start_matches("https://sextb.net/") + .trim_end_matches('/') + .to_string(); + if slug.is_empty() { + continue; + } + + let thumb = captures + .name("thumb") + .map(|value| value.as_str()) + .unwrap_or_default() + .to_string(); + if thumb.is_empty() { + continue; + } + + let quality = meta + .name("quality") + .map(|value| value.as_str().to_string()) + .filter(|value| !value.is_empty()); + let code = meta + .name("code") + .map(|value| value.as_str().to_string()) + .filter(|value| !value.is_empty()); + let duration = meta + .name("duration") + .map(|value| Self::parse_duration(value.as_str())) + .unwrap_or(0); + + let mut tags = Self::parse_list_prefix_tags(&prefix); + if let Some(value) = quality.clone() { + tags.push(value); + } + if let Some(value) = code.clone() { + tags.push(value.clone()); + tags.push(value.to_ascii_uppercase()); + } + tags.sort(); + tags.dedup(); + + items.push(ListItemSeed { + id: slug.clone(), + title, + url, + thumb, + duration, + quality, + code, + tags, + }); + } + + Ok(items) + } + + fn extract_link_texts(line: &str) -> Result<Vec<(String, String)>> { + let regex = 
Self::regex(r#"\[(?P<title>[^\]]+)\]\((?P<url>https://sextb\.net/[^)\s]+)"#)?; + Ok(regex + .captures_iter(line) + .filter_map(|captures| { + let title = captures.name("title")?.as_str().trim().trim_matches('*').to_string(); + let url = captures.name("url")?.as_str().to_string(); + Some((title, url)) + }) + .collect()) + } + + fn detail_link_slug(url: &str, prefix: &str) -> Option<String> { + let path = url.strip_prefix("https://sextb.net/")?; + let path = path.strip_prefix(prefix)?; + Some(path.trim_start_matches('/').trim_end_matches('/').to_string()) + } + + fn parse_detail(markdown: &str) -> Result<DetailMetadata> { + let mut metadata = DetailMetadata::default(); + + let aliases_regex = Self::regex(r#"(?m)^# \*\*(?P<body>.+?)\*\*$"#)?; + if let Some(captures) = aliases_regex.captures(markdown) { + let body = captures.name("body").map(|value| value.as_str()).unwrap_or_default(); + let link_regex = Self::regex(r#"\[(?P<alias>[^\]]+)\]\(https://sextb\.net/search/[^)]+\)"#)?; + for alias_match in link_regex.captures_iter(body) { + if let Some(alias) = alias_match.name("alias").map(|value| value.as_str().trim()) { + if !alias.is_empty() { + metadata.aliases.push(alias.to_string()); + } + } + } + } + + let line_regex = Self::regex(r#"(?m)^(?P<key>Director|Label|Studio|Cast\(s\)|Genre\(s\)|Quality|Release Date|Runtimes|Added|Viewed|Description):\s*(?P<value>.+)$"#)?; + for captures in line_regex.captures_iter(markdown) { + let key = captures.name("key").map(|value| value.as_str()).unwrap_or_default(); + let value = captures.name("value").map(|value| value.as_str()).unwrap_or_default().trim(); + match key { + "Director" => { + metadata.director = Self::extract_link_texts(value)? + .first() + .map(|(title, _)| title.clone()); + } + "Label" => { + metadata.label = Self::extract_link_texts(value)? 
+ .first() + .map(|(title, _)| title.clone()); + } + "Studio" => { + if let Some((title, url)) = Self::extract_link_texts(value)?.into_iter().next() { + let slug = + Self::detail_link_slug(&url, "studio").unwrap_or_else(|| Self::slugify(&title)); + metadata.studio = Some((title, slug)); + } + } + "Cast(s)" => { + let mut cast = Vec::new(); + for (title, url) in Self::extract_link_texts(value)? { + let slug = Self::detail_link_slug(&url, "actress") + .unwrap_or_else(|| Self::slugify(&title)); + cast.push((title, slug)); + } + metadata.cast = cast; + } + "Genre(s)" => { + let mut genres = Vec::new(); + for (title, url) in Self::extract_link_texts(value)? { + let slug = Self::detail_link_slug(&url, "genre") + .unwrap_or_else(|| Self::slugify(&title)); + genres.push((title, slug)); + } + metadata.genres = genres; + } + "Quality" => { + let quality = value.trim_matches('*').trim().to_string(); + if !quality.is_empty() { + metadata.quality = Some(quality); + } + } + "Release Date" => { + let cleaned = value.trim_matches('*').trim(); + metadata.release_date = Self::parse_absolute_date(cleaned); + } + "Runtimes" => {} + "Added" => { + let cleaned = value.trim_matches('*').trim(); + metadata.added_at = Self::parse_relative_or_date(cleaned); + } + "Viewed" => { + metadata.views = Self::parse_abbrev(value.trim()); + } + "Description" => {} + _ => {} + } + } + + let stats_regex = + Self::regex(r#"(?m)(?P<likes>[0-9.]+[KM]?)\s+Liked(?:\s+(?P<dislikes>[0-9.]+[KM]?)\s+Disliked)?(?:\s+(?P<favorites>[0-9.]+[KM]?)\s+Favorited)?\s+(?P<views>[0-9.]+[KM]?)\s+Views"#)?; + if let Some(captures) = stats_regex.captures(markdown) { + if metadata.views.is_none() { + metadata.views = captures + .name("views") + .and_then(|value| Self::parse_abbrev(value.as_str())); + } + if let (Some(likes), Some(dislikes)) = ( + captures + .name("likes") + .and_then(|value| Self::parse_abbrev(value.as_str())), + captures + .name("dislikes") + .and_then(|value| Self::parse_abbrev(value.as_str())), + ) { + let 
total = likes + dislikes; + if total > 0 { + metadata.rating = Some((likes as f32 / total as f32) * 100.0); + } + } + } + + metadata.aliases.sort(); + metadata.aliases.dedup(); + Ok(metadata) + } + + fn apply_detail_metadata(&self, seed: ListItemSeed, detail: DetailMetadata) -> VideoItem { + let mut item = VideoItem::new( + seed.id, + seed.title, + seed.url, + CHANNEL_ID.to_string(), + seed.thumb.clone(), + seed.duration, + ); + + item.preview = Some(seed.thumb); + item.views = detail.views; + item.rating = detail.rating; + item.uploadedAt = detail.added_at.or(detail.release_date); + + if !detail.cast.is_empty() { + item.uploader = Some( + detail + .cast + .iter() + .map(|(name, _)| name.clone()) + .collect::<Vec<_>>() + .join(", "), + ); + if let Some((_, slug)) = detail.cast.first() { + item.uploaderUrl = Some(format!("{}/actress/{}", self.url, slug)); + } + } else if let Some((studio, slug)) = detail.studio.as_ref() { + item.uploader = Some(studio.clone()); + item.uploaderUrl = Some(format!("{}/studio/{}", self.url, slug)); + } + + let mut tags = seed.tags; + if let Some(quality) = seed.quality { + tags.push(quality); + } + if let Some(code) = seed.code { + tags.push(code); + } + for alias in detail.aliases { + tags.push(alias); + } + if let Some(director) = detail.director { + tags.push(director); + } + if let Some(label) = detail.label { + tags.push(label); + } + if let Some((studio, _)) = detail.studio { + tags.push(studio); + } + for (cast, _) in detail.cast { + tags.push(cast); + } + for (genre, _) in detail.genres { + tags.push(genre); + } + if let Some(quality) = detail.quality { + tags.push(quality); + } + + tags.retain(|value| !value.trim().is_empty()); + tags.sort(); + tags.dedup(); + if !tags.is_empty() { + item.tags = Some(tags); + } + + item + } + + async fn enrich_seed(&self, seed: ListItemSeed, options: &ServerOptions) -> VideoItem { + let mut requester = requester_or_default(options, CHANNEL_ID, "enrich_seed"); + let detail_fetch = timeout( + 
StdDuration::from_secs(10), + self.fetch_markdown(&mut requester, &seed.url), + ) + .await; + + match detail_fetch { + Ok(Ok(markdown)) => match Self::parse_detail(&markdown) { + Ok(detail) => self.apply_detail_metadata(seed.clone(), detail), + Err(error) => { + report_provider_error_background(CHANNEL_ID, "parse_detail", &error.to_string()); + self.apply_detail_metadata(seed, DetailMetadata::default()) + } + }, + Ok(Err(error)) => { + report_provider_error_background(CHANNEL_ID, "fetch_detail", &error.to_string()); + self.apply_detail_metadata(seed, DetailMetadata::default()) + } + Err(_) => { + report_provider_error_background(CHANNEL_ID, "fetch_detail_timeout", &seed.url); + self.apply_detail_metadata(seed, DetailMetadata::default()) + } + } + } + + async fn fetch_items_for_url( + &self, + cache: VideoCache, + url: String, + per_page_limit: usize, + options: &ServerOptions, + ) -> Result<Vec<VideoItem>> { + if let Some((time, items)) = cache.get(&url) { + if time.elapsed().unwrap_or_default().as_secs() < 60 * 15 { + return Ok(items.clone()); + } + } + + let mut requester = requester_or_default(options, CHANNEL_ID, "fetch_items_for_url"); + let markdown = timeout( + StdDuration::from_secs(12), + self.fetch_markdown(&mut requester, &url), + ) + .await + .map_err(|_| Error::from(format!("list request timed out for {url}")))??; + + let seeds = Self::parse_list_items(&markdown)? 
+ .into_iter() + .take(per_page_limit.max(1)) + .collect::<Vec<_>>(); + if seeds.is_empty() { + return Ok(vec![]); + } + + let items = stream::iter(seeds.into_iter().map(|seed| { + let provider = self.clone(); + let options = options.clone(); + async move { provider.enrich_seed(seed, &options).await } + })) + .buffer_unordered(4) + .collect::<Vec<_>>() + .await; + + if !items.is_empty() { + cache.insert(url, items.clone()); + } + + Ok(items) + } + + async fn get( + &self, + cache: VideoCache, + page: u16, + sort: &str, + per_page_limit: usize, + options: ServerOptions, + ) -> Result<Vec<VideoItem>> { + let target = self.resolve_option_target(&options, sort); + let url = self.build_url_for_target(&target, page); + self.fetch_items_for_url(cache, url, per_page_limit, &options) + .await + } + + async fn query( + &self, + cache: VideoCache, + page: u16, + sort: &str, + query: &str, + per_page_limit: usize, + options: ServerOptions, + ) -> Result<Vec<VideoItem>> { + let target = self.resolve_query_target(query, sort); + let url = self.build_url_for_target(&target, page); + self.fetch_items_for_url(cache, url, per_page_limit, &options) + .await + } +} + +#[async_trait] +impl Provider for SextbProvider { + async fn get_videos( + &self, + cache: VideoCache, + pool: DbPool, + sort: String, + query: Option<String>, + page: String, + per_page: String, + options: ServerOptions, + ) -> Vec<VideoItem> { + let _ = pool; + let page = page.parse::<u16>().unwrap_or(1); + let per_page_limit = per_page.parse::<usize>().unwrap_or(30); + + let result = match query { + Some(query) if !query.trim().is_empty() => { + self.query(cache, page, &sort, &query, per_page_limit, options) + .await + } + _ => self.get(cache, page, &sort, per_page_limit, options).await, + }; + + match result { + Ok(videos) => videos, + Err(error) => { + report_provider_error(CHANNEL_ID, "get_videos", &error.to_string()).await; + vec![] + } + } + } + + fn get_channel(&self, clientversion: ClientVersion) -> 
Option<Channel> { + Some(self.build_channel(clientversion)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn provider() -> SextbProvider { + SextbProvider { + url: BASE_URL.to_string(), + genres: Arc::new(RwLock::new(vec![FilterOption { + id: "big-tits".to_string(), + title: "Big Tits".to_string(), + }])), + studios: Arc::new(RwLock::new(vec![FilterOption { + id: "madonna".to_string(), + title: "Madonna".to_string(), + }])), + actresses: Arc::new(RwLock::new(vec![FilterOption { + id: "yui-hatano".to_string(), + title: "Yui Hatano".to_string(), + }])), + } + } + + #[test] + fn builds_section_page_two_url() { + let url = provider().build_url_for_target( + &Target::Section { + section: "amateur".to_string(), + sort: "viewed".to_string(), + }, + 2, + ); + assert_eq!( + url, + "https://sextb.net/amateur?genre=all&studio=all&quality=all&year=all&sort=viewed&pg=2" + ); + } + + #[test] + fn builds_search_page_two_url() { + let url = provider().build_url_for_target( + &Target::Search { + slug: "yui-hatano".to_string(), + }, + 2, + ); + assert_eq!(url, "https://sextb.net/search/yui-hatano/pg-2"); + } + + #[test] + fn matches_query_to_filter_targets() { + let provider = provider(); + + match provider.resolve_query_target("Big Tits", "latest") { + Target::Genre { slug } => assert_eq!(slug, "big-tits"), + other => panic!("unexpected target: {other:?}"), + } + + match provider.resolve_query_target("Madonna", "latest") { + Target::Studio { slug } => assert_eq!(slug, "madonna"), + other => panic!("unexpected target: {other:?}"), + } + + match provider.resolve_query_target("Yui Hatano", "latest") { + Target::Actress { slug } => assert_eq!(slug, "yui-hatano"), + other => panic!("unexpected target: {other:?}"), + } + } + + #[test] + fn parses_detail_metadata_block() { + let markdown = r#" +57 Liked 16 Disliked 88 Favorited 45.2K Views + + Director: [**Himurokku**](https://sextb.net/director/himurokku "Himurokku") + Label: [**Madonna**](https://sextb.net/label/madonna 
"Madonna") + Studio: [**Madonna**](https://sextb.net/studio/madonna "MADONNA") + Cast(s): [**Miyuu Imai**](https://sextb.net/actress/miyuu-imai "Miyuu Imai") + Genre(s): [**Big Tits**](https://sextb.net/genre/big-tits "Big Tits"), [**Creampie**](https://sextb.net/genre/creampie "Creampie") + Quality: **HD** + Release Date: **Mar. 24, 2026** + Added: **2 days ago** + Viewed: 45.2K +"#; + + let detail = SextbProvider::parse_detail(markdown).expect("detail should parse"); + assert_eq!(detail.director.as_deref(), Some("Himurokku")); + assert_eq!(detail.label.as_deref(), Some("Madonna")); + assert_eq!(detail.studio.as_ref().map(|value| value.0.as_str()), Some("Madonna")); + assert_eq!(detail.cast.len(), 1); + assert_eq!(detail.genres.len(), 2); + assert_eq!(detail.views, Some(45200)); + assert!(detail.rating.unwrap_or_default() > 70.0); + assert_eq!(detail.quality.as_deref(), Some("HD")); + assert!(detail.release_date.is_some()); + assert!(detail.added_at.is_some()); + } +} diff --git a/src/proxies/noodlemagazine.rs b/src/proxies/noodlemagazine.rs index 6be39f6..e5d4e80 100644 --- a/src/proxies/noodlemagazine.rs +++ b/src/proxies/noodlemagazine.rs @@ -14,8 +14,7 @@ const FIREFOX_USER_AGENT: &str = "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0"; const HTML_ACCEPT: &str = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"; -const IMAGE_ACCEPT: &str = - "image/avif,image/webp,image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.5"; +const IMAGE_ACCEPT: &str = "image/avif,image/webp,image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.5"; #[derive(Debug, Clone)] pub struct NoodlemagazineProxy {} @@ -321,11 +320,7 @@ pub async fn get_image( if needs_warmup { let _ = requester - .get_with_headers( - image_url.as_str(), - headers.clone(), - Some(Version::HTTP_11), - ) + .get_with_headers(image_url.as_str(), headers.clone(), Some(Version::HTTP_11)) .await; headers = NoodlemagazineProxy::image_headers(&requester, 
image_url.as_str()); upstream = requester diff --git a/src/proxies/pimpbunny.rs b/src/proxies/pimpbunny.rs index dd0d453..4b12615 100644 --- a/src/proxies/pimpbunny.rs +++ b/src/proxies/pimpbunny.rs @@ -51,7 +51,10 @@ impl PimpbunnyProxy { fn html_headers_with_referer(referer: &str) -> Vec<(String, String)> { vec![ ("Referer".to_string(), referer.to_string()), - ("User-Agent".to_string(), Self::FIREFOX_USER_AGENT.to_string()), + ( + "User-Agent".to_string(), + Self::FIREFOX_USER_AGENT.to_string(), + ), ("Accept".to_string(), Self::HTML_ACCEPT.to_string()), ("Accept-Language".to_string(), "en-US,en;q=0.9".to_string()), ] @@ -80,7 +83,8 @@ impl PimpbunnyProxy { } fn extract_json_ld_video(text: &str) -> Option<Value> { - let script_regex = Regex::new(r#"(?s)<script[^>]+application/ld\+json[^>]*>(.*?)</script>"#).ok()?; + let script_regex = + Regex::new(r#"(?s)<script[^>]+application/ld\+json[^>]*>(.*?)</script>"#).ok()?; for captures in script_regex.captures_iter(text) { let raw = captures.get(1).map(|value| value.as_str().trim())?; @@ -228,7 +232,8 @@ mod tests { </script> "#; - let json_ld = PimpbunnyProxy::extract_json_ld_video(html).expect("video object should parse"); + let json_ld = + PimpbunnyProxy::extract_json_ld_video(html).expect("video object should parse"); assert_eq!( PimpbunnyProxy::extract_stream_url(&json_ld).as_deref(), Some("https://cdn.example/graph.mp4") diff --git a/src/proxies/pimpbunnythumb.rs b/src/proxies/pimpbunnythumb.rs index cdc508c..82df158 100644 --- a/src/proxies/pimpbunnythumb.rs +++ b/src/proxies/pimpbunnythumb.rs @@ -12,8 +12,7 @@ const FIREFOX_USER_AGENT: &str = "Mozilla/5.0 (X11; Linux x86_64; rv:147.0) Gecko/20100101 Firefox/147.0"; const HTML_ACCEPT: &str = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"; -const IMAGE_ACCEPT: &str = - "image/avif,image/webp,image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.5"; +const IMAGE_ACCEPT: &str = 
"image/avif,image/webp,image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.5"; fn root_referer() -> &'static str { "https://pimpbunny.com/" @@ -157,7 +156,9 @@ mod tests { #[test] fn rejects_non_thumb_or_non_pimpbunny_urls() { - assert!(!is_allowed_thumb_url("http://pimpbunny.com/contents/videos_screenshots/x.jpg")); + assert!(!is_allowed_thumb_url( + "http://pimpbunny.com/contents/videos_screenshots/x.jpg" + )); assert!(!is_allowed_thumb_url( "https://pimpbunny.com/videos/example-video/" )); diff --git a/src/util/requester.rs b/src/util/requester.rs index 39aadb8..14555ec 100644 --- a/src/util/requester.rs +++ b/src/util/requester.rs @@ -563,10 +563,8 @@ mod tests { let b = Requester::new(); let origin = "https://shared-cookie-requester-test.invalid/"; - a.cookie_jar.add_cookie_str( - "shared_cookie=1; Path=/; SameSite=Lax", - origin, - ); + a.cookie_jar + .add_cookie_str("shared_cookie=1; Path=/; SameSite=Lax", origin); let cookie_header = b .cookie_header_for_url("https://shared-cookie-requester-test.invalid/path")