diff --git a/src/providers/beeg.rs b/src/providers/beeg.rs index 69231c7..05684f8 100644 --- a/src/providers/beeg.rs +++ b/src/providers/beeg.rs @@ -1,6 +1,6 @@ use crate::DbPool; use crate::api::ClientVersion; -use crate::providers::{Provider, report_provider_error_background}; +use crate::providers::{Provider, report_provider_error, report_provider_error_background}; use crate::util::cache::VideoCache; use crate::util::parse_abbreviated_number; use crate::videos::{ServerOptions, VideoItem}; @@ -11,6 +11,7 @@ use htmlentity::entity::{ICodedDataTrait, decode}; use serde_json::Value; use std::sync::{Arc, RwLock}; use std::thread; +use std::time::Duration; use std::vec; error_chain! { @@ -73,14 +74,15 @@ impl BeegProvider { }; rt.block_on(async move { - if let Err(e) = Self::load_sites(sites).await { - eprintln!("beeg load_sites failed: {}", e); - } - if let Err(e) = Self::load_categories(categories).await { - eprintln!("beeg load_categories failed: {}", e); - } - if let Err(e) = Self::load_stars(stars).await { - eprintln!("beeg load_stars failed: {}", e); + match Self::fetch_tags().await { + Ok(json) => { + Self::load_sites(&json, sites); + Self::load_categories(&json, categories); + Self::load_stars(&json, stars); + } + Err(e) => { + report_provider_error("beeg", "init.fetch_tags", &e.to_string()).await; + } } }); }); @@ -88,24 +90,36 @@ impl BeegProvider { async fn fetch_tags() -> Result<Value> { let mut requester = util::requester::Requester::new(); - let text = match requester - .get( - "https://store.externulls.com/tag/facts/tags?get_original=true&slug=index", - None, - ) - .await - { - Ok(text) => text, - Err(e) => { - eprintln!("beeg fetch_tags failed: {}", e); - return Err(ErrorKind::Parse("failed to fetch tags".into()).into()); + let endpoints = [ + "https://store.externulls.com/tag/facts/tags?get_original=true&slug=index", + "https://store.externulls.com/tag/facts/tags?slug=index", + ]; + let mut errors: Vec<String> = vec![]; + + for endpoint in endpoints { + for attempt 
in 1..=3 { + match requester.get(endpoint, None).await { + Ok(text) => match serde_json::from_str::<Value>(&text) { + Ok(json) => return Ok(json), + Err(e) => { + errors + .push(format!("endpoint={endpoint}; attempt={attempt}; parse={e}")); + } + }, + Err(e) => { + errors.push(format!( + "endpoint={endpoint}; attempt={attempt}; request={e}" + )); + } + } + tokio::time::sleep(Duration::from_millis(250 * attempt as u64)).await; + } - }; - Ok(serde_json::from_str(&text)?) + } + + Err(ErrorKind::Parse(format!("failed to fetch tags; {}", errors.join(" | "))).into()) } - async fn load_stars(stars: Arc<RwLock<Vec<FilterOption>>>) -> Result<()> { - let json = Self::fetch_tags().await?; + fn load_stars(json: &Value, stars: Arc<RwLock<Vec<FilterOption>>>) { let arr = json .get("human") .and_then(|v| v.as_array().map(|v| v.as_slice())) @@ -124,11 +138,9 @@ impl BeegProvider { ); } } - Ok(()) } - async fn load_categories(categories: Arc<RwLock<Vec<FilterOption>>>) -> Result<()> { - let json = Self::fetch_tags().await?; + fn load_categories(json: &Value, categories: Arc<RwLock<Vec<FilterOption>>>) { let arr = json .get("other") .and_then(|v| v.as_array().map(|v| v.as_slice())) @@ -147,11 +159,9 @@ impl BeegProvider { ); } } - Ok(()) } - async fn load_sites(sites: Arc<RwLock<Vec<FilterOption>>>) -> Result<()> { - let json = Self::fetch_tags().await?; + fn load_sites(json: &Value, sites: Arc<RwLock<Vec<FilterOption>>>) { let arr = json .get("productions") .and_then(|v| v.as_array().map(|v| v.as_slice())) @@ -170,7 +180,6 @@ impl BeegProvider { ); } } - Ok(()) } fn push_unique(target: &Arc<RwLock<Vec<FilterOption>>>, item: FilterOption) { diff --git a/src/providers/mod.rs b/src/providers/mod.rs index 1bde43e..17d9471 100644 --- a/src/providers/mod.rs +++ b/src/providers/mod.rs @@ -36,6 +36,7 @@ pub mod porn00; pub mod pornzog; pub mod sxyprn; pub mod tnaflix; +pub mod xfree; pub mod xxthots; pub mod youjizz; // pub mod pornxp; @@ -143,6 +144,10 @@ pub static ALL_PROVIDERS: Lazy<HashMap<&'static str, DynProvider>> = Lazy::new(| "xxdbx", Arc::new(xxdbx::XxdbxProvider::new()) as DynProvider, ); + m.insert( + "xfree", + Arc::new(xfree::XfreeProvider::new()) as DynProvider, + ); m.insert( "hqporner", 
Arc::new(hqporner::HqpornerProvider::new()) as DynProvider, diff --git a/src/providers/xfree.rs b/src/providers/xfree.rs new file mode 100644 index 0000000..c51d502 --- /dev/null +++ b/src/providers/xfree.rs @@ -0,0 +1,751 @@ +use crate::DbPool; +use crate::api::ClientVersion; +use crate::providers::{Provider, report_provider_error_background, requester_or_default}; +use crate::status::*; +use crate::util::cache::VideoCache; +use crate::util::discord::send_discord_error_report; +use crate::util::parse_abbreviated_number; +use crate::util::time::parse_time_to_seconds; +use crate::videos::{ServerOptions, VideoFormat, VideoItem}; +use async_trait::async_trait; +use error_chain::error_chain; +use futures::stream::{FuturesUnordered, StreamExt}; +use htmlentity::entity::{ICodedDataTrait, decode}; +use regex::Regex; +use std::collections::HashSet; +use std::fmt::Write; +use std::vec; +use url::form_urlencoded::{Serializer, parse}; + +error_chain! { + foreign_links { + Io(std::io::Error); + HttpRequest(wreq::Error); + } +} + +#[derive(Debug, Clone)] +pub struct XfreeProvider { + url: String, +} + +#[derive(Debug, Clone)] +struct RawListingItem { + id: String, + title: String, + detail_url: String, + thumb: String, + duration: u32, + views: Option, + uploader: Option, + tags: Vec, +} + +impl XfreeProvider { + pub fn new() -> Self { + Self { + url: "https://www.xfree.com".to_string(), + } + } + + fn build_channel(&self, _clientversion: ClientVersion) -> Channel { + Channel { + id: "xfree".to_string(), + name: "XFree".to_string(), + description: "Short NSFW clips from xfree.com".to_string(), + premium: false, + favicon: "https://www.google.com/s2/favicons?sz=64&domain=xfree.com".to_string(), + status: "active".to_string(), + categories: vec![ + "all".to_string(), + "straight".to_string(), + "gay".to_string(), + "trans".to_string(), + ], + options: vec![ + ChannelOption { + id: "sort".to_string(), + title: "Sort".to_string(), + description: "Sort listing 
preference".to_string(), + systemImage: "list.number".to_string(), + colorName: "blue".to_string(), + options: vec![ + FilterOption { + id: "trending".to_string(), + title: "Trending".to_string(), + }, + FilterOption { + id: "latest".to_string(), + title: "Latest".to_string(), + }, + ], + multiSelect: false, + }, + ChannelOption { + id: "category".to_string(), + title: "Category".to_string(), + description: "Audience/category feed".to_string(), + systemImage: "line.horizontal.3.decrease.circle".to_string(), + colorName: "green".to_string(), + options: vec![ + FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }, + FilterOption { + id: "straight".to_string(), + title: "Straight".to_string(), + }, + FilterOption { + id: "gay".to_string(), + title: "Gay".to_string(), + }, + FilterOption { + id: "trans".to_string(), + title: "Trans".to_string(), + }, + ], + multiSelect: false, + }, + ], + nsfw: true, + cacheDuration: Some(300), + } + } + + fn normalize_ws(input: &str) -> String { + input.split_whitespace().collect::>().join(" ") + } + + fn decode_html(input: &str) -> String { + decode(input.as_bytes()) + .to_string() + .unwrap_or_else(|_| input.to_string()) + } + + fn clean_media_url(raw: &str) -> String { + let mut out = raw + .trim_matches(|c: char| c == '"' || c == '\'' || c == '\\' || c.is_whitespace()) + .to_string(); + out = out + .replace("\\u0026", "&") + .replace("\\u002F", "/") + .replace("\\/", "/") + .replace("&", "&"); + out = out + .trim_end_matches(|c: char| matches!(c, ',' | ';' | ')' | ']' | '}')) + .to_string(); + if out.starts_with("//") { + return format!("https:{out}"); + } + out + } + + fn is_downloadable_media_url(url: &str) -> bool { + let lower = url.to_ascii_lowercase(); + (lower.starts_with("http://") || lower.starts_with("https://")) + && (lower.contains(".mp4") || lower.contains(".m3u8")) + } + + fn absolute_url(&self, path: &str) -> String { + if path.starts_with("http://") || path.starts_with("https://") { + return 
path.to_string(); + } + if path.starts_with("//") { + return format!("https:{path}"); + } + if path.starts_with('/') { + return format!("{}{}", self.url, path); + } + format!("{}/{}", self.url, path.trim_start_matches('/')) + } + + fn encode_query_value(value: &str) -> String { + let mut serializer = Serializer::new(String::new()); + serializer.append_pair("q", value); + let encoded = serializer.finish(); + encoded.strip_prefix("q=").unwrap_or(&encoded).to_string() + } + + fn category_value(options: &ServerOptions) -> String { + options + .category + .clone() + .unwrap_or_else(|| "all".to_string()) + .to_ascii_lowercase() + } + + fn sort_value(options: &ServerOptions) -> String { + options + .sort + .clone() + .unwrap_or_else(|| "trending".to_string()) + .to_ascii_lowercase() + } + + fn category_suffix(category: &str) -> Option<&'static str> { + match category { + "gay" => Some("gay"), + "trans" => Some("trans"), + "straight" => Some("straight"), + _ => None, + } + } + + fn with_page(mut url: String, page: u8) -> String { + if page <= 1 { + return url; + } + if url.contains('?') { + url.push_str(&format!("&page={page}")); + } else { + url.push_str(&format!("?page={page}")); + } + url + } + + fn build_listing_urls(&self, page: u8, query: &str, options: &ServerOptions) -> Vec { + let category = Self::category_value(options); + let sort = Self::sort_value(options); + let encoded_query = Self::encode_query_value(query.trim()); + let category_suffix = Self::category_suffix(&category); + let mut urls = Vec::new(); + + if !query.trim().is_empty() { + if let Some(suffix) = category_suffix { + urls.push(Self::with_page( + format!("{}/search-{suffix}?q={encoded_query}", self.url), + page, + )); + } + urls.push(Self::with_page( + format!("{}/search?q={encoded_query}", self.url), + page, + )); + return urls; + } + + let base_category_url = match category_suffix { + Some(suffix) => format!("{}/{}", self.url, suffix), + None => self.url.clone(), + }; + + if sort == "latest" { + 
urls.push(Self::with_page( + format!("{}/latest", base_category_url), + page, + )); + urls.push(Self::with_page( + format!("{base_category_url}?sort=latest"), + page, + )); + } + urls.push(Self::with_page(base_category_url, page)); + + urls + } + + fn extract_href_param(href: &str, key: &str) -> Option { + let query = href.split('?').nth(1)?; + for (k, v) in parse(query.as_bytes()) { + if k == key { + return Some(v.into_owned()); + } + } + None + } + + fn strip_html_tags(text: &str) -> String { + let Ok(tags_re) = Regex::new(r"(?is)<[^>]+>") else { + return text.to_string(); + }; + tags_re.replace_all(text, " ").to_string() + } + + fn extract_duration_seconds(text: &str) -> Option { + let Ok(duration_re) = Regex::new(r"\b(\d{1,2}:\d{2}(?::\d{2})?)\b") else { + return None; + }; + if let Some(caps) = duration_re.captures(text) { + if let Some(raw) = caps.get(1) { + return parse_time_to_seconds(raw.as_str()).map(|v| v as u32); + } + } + + let Ok(iso_re) = Regex::new(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?") else { + return None; + }; + let caps = iso_re.captures(text)?; + let h = caps + .get(1) + .and_then(|m| m.as_str().parse::().ok()) + .unwrap_or(0); + let m = caps + .get(2) + .and_then(|m| m.as_str().parse::().ok()) + .unwrap_or(0); + let s = caps + .get(3) + .and_then(|m| m.as_str().parse::().ok()) + .unwrap_or(0); + let total = h.saturating_mul(3600) + m.saturating_mul(60) + s; + if total > 0 { Some(total) } else { None } + } + + fn extract_views(text: &str) -> Option { + let Ok(views_re) = Regex::new(r"(?i)\b([0-9]+(?:\.[0-9]+)?\s*[kmb]?)\s*(?:views?|view)\b") + else { + return None; + }; + let raw = views_re + .captures(text) + .and_then(|c| c.get(1)) + .map(|m| m.as_str().to_string())?; + parse_abbreviated_number(&raw) + } + + fn extract_tags(text: &str) -> Vec { + let Ok(tag_re) = Regex::new(r"#([A-Za-z0-9_]+)") else { + return vec![]; + }; + let mut seen = HashSet::new(); + let mut tags = vec![]; + for caps in tag_re.captures_iter(text) { + let Some(raw) = 
caps.get(1).map(|m| m.as_str()) else { + continue; + }; + let tag = raw.to_ascii_lowercase(); + if seen.insert(tag.clone()) { + tags.push(tag); + } + } + tags + } + + fn extract_thumb_from_segment(&self, segment: &str) -> String { + let Ok(thumb_re) = Regex::new( + r#"(?is)(https?://[^"' <]*(?:thumbs|peek|prbn)\.xfree\.com[^"' <]*\.(?:jpg|jpeg|png|webp))"#, + ) else { + return String::new(); + }; + if let Some(m) = thumb_re.captures(segment).and_then(|c| c.get(1)) { + return m.as_str().to_string(); + } + + let Ok(img_attr_re) = Regex::new(r#"(?is)(?:src|data-src|data-original)="([^"]+)""#) else { + return String::new(); + }; + if let Some(m) = img_attr_re.captures(segment).and_then(|c| c.get(1)) { + return self.absolute_url(m.as_str()); + } + + String::new() + } + + fn extract_quality_from_url(url: &str) -> String { + let Ok(q_re) = Regex::new(r"(?i)(\d{3,4})p") else { + return "1080".to_string(); + }; + if let Some(q) = q_re.captures(url).and_then(|c| c.get(1)) { + return q.as_str().to_string(); + } + if url.to_ascii_lowercase().contains(".m3u8") { + return "hls".to_string(); + } + "1080".to_string() + } + + fn parse_listing_items(&self, html: &str) -> Vec { + if html.trim().is_empty() { + return vec![]; + } + let Ok(link_re) = Regex::new( + r#"(?is)]+href="(?P/(?:video(?:-[a-z]+)?\?id=\d+[^"]*))"[^>]*>(?P.*?)"#, + ) else { + return vec![]; + }; + let Ok(title_attr_re) = Regex::new(r#"(?is)\btitle="([^"]+)""#) else { + return vec![]; + }; + let Ok(uploader_re) = + Regex::new(r#"(?is)href="/(?:u|user|profile)/[^"]+"[^>]*>\s*([^<]{2,64})\s*<"#) + else { + return vec![]; + }; + + let mut items = vec![]; + let mut seen_ids = HashSet::new(); + + for caps in link_re.captures_iter(html) { + let Some(full) = caps.get(0) else { + continue; + }; + let href = caps.name("href").map(|m| m.as_str()).unwrap_or(""); + let body = caps.name("body").map(|m| m.as_str()).unwrap_or(""); + let Some(id) = Self::extract_href_param(href, "id") else { + continue; + }; + if 
!seen_ids.insert(id.clone()) { + continue; + } + + let seg_start = full.start().saturating_sub(400); + let seg_end = (full.end() + 1600).min(html.len()); + let segment = html.get(seg_start..seg_end).unwrap_or(full.as_str()); + + let title_from_attr = title_attr_re + .captures(full.as_str()) + .and_then(|c| c.get(1)) + .map(|m| m.as_str().to_string()) + .unwrap_or_default(); + let title_from_body = Self::strip_html_tags(body); + let title_from_href = Self::extract_href_param(href, "title") + .map(|s| s.replace('-', " ")) + .unwrap_or_default(); + let title = Self::normalize_ws(&Self::decode_html(if !title_from_attr.is_empty() { + &title_from_attr + } else if !title_from_body.trim().is_empty() { + &title_from_body + } else { + &title_from_href + })); + if title.is_empty() { + continue; + } + + let thumb = self.extract_thumb_from_segment(segment); + let duration = Self::extract_duration_seconds(segment).unwrap_or(0); + let views = Self::extract_views(segment); + let uploader = uploader_re + .captures(segment) + .and_then(|c| c.get(1)) + .map(|m| Self::normalize_ws(m.as_str())) + .filter(|s| !s.is_empty()); + let tags = Self::extract_tags(segment); + + items.push(RawListingItem { + id, + title, + detail_url: self.absolute_url(href), + thumb, + duration, + views, + uploader, + tags, + }); + } + + items + } + + fn extract_media_urls(&self, html: &str) -> Vec { + let mut urls = vec![]; + let mut seen = HashSet::new(); + + let patterns = [ + r#"https?:\\?/\\?/[^"' <>\s]+?\.(?:mp4|m3u8)[^"' <>\s]*"#, + r#"https?://[^"' <>\s]+?\.(?:mp4|m3u8)[^"' <>\s]*"#, + ]; + + for pattern in patterns { + let Ok(re) = Regex::new(pattern) else { + continue; + }; + for m in re.find_iter(html) { + let cleaned = Self::clean_media_url(m.as_str()); + if !Self::is_downloadable_media_url(&cleaned) { + continue; + } + if seen.insert(cleaned.clone()) { + urls.push(cleaned); + } + } + } + + urls + } + + fn extract_detail_tags(html: &str) -> Vec { + let Ok(tag_link_re) = 
Regex::new(r#"(?is)href="/tag(?:-[a-z]+)?/([^"?#]+)"#) else { + return vec![]; + }; + let mut seen = HashSet::new(); + let mut tags = vec![]; + for caps in tag_link_re.captures_iter(html) { + let Some(raw) = caps.get(1).map(|m| m.as_str()) else { + continue; + }; + let tag = raw + .replace('-', " ") + .replace("%20", " ") + .trim() + .to_ascii_lowercase(); + if tag.is_empty() { + continue; + } + if seen.insert(tag.clone()) { + tags.push(tag); + } + } + tags + } + + fn extract_detail_thumb(&self, html: &str) -> String { + self.extract_thumb_from_segment(html) + } + + async fn fetch_detailed_video_item( + &self, + raw: RawListingItem, + mut requester: crate::util::requester::Requester, + ) -> Option { + let detail_html = match requester.get(&raw.detail_url, None).await { + Ok(t) => t, + Err(e) => { + report_provider_error_background( + "xfree", + "detail.request", + &format!("url={}; error={e}", raw.detail_url), + ); + return None; + } + }; + + let media_urls = self.extract_media_urls(&detail_html); + if media_urls.is_empty() { + report_provider_error_background( + "xfree", + "detail.media", + &format!("no_media_url_found; url={}", raw.detail_url), + ); + return None; + } + + let thumb = if raw.thumb.is_empty() { + self.extract_detail_thumb(&detail_html) + } else { + raw.thumb.clone() + }; + + let duration = if raw.duration > 0 { + raw.duration + } else { + Self::extract_duration_seconds(&detail_html).unwrap_or(0) + }; + + let mut tags = raw.tags.clone(); + for tag in Self::extract_detail_tags(&detail_html) { + if !tags.iter().any(|t| t == &tag) { + tags.push(tag); + } + } + + let mut formats = vec![]; + for media_url in media_urls.iter() { + let format_kind = if media_url.to_ascii_lowercase().contains(".m3u8") { + "m3u8".to_string() + } else { + "mp4".to_string() + }; + let quality = Self::extract_quality_from_url(media_url); + formats.push(VideoFormat::new(media_url.clone(), quality, format_kind)); + } + + let selected_url = media_urls + .iter() + .find(|u| 
u.to_ascii_lowercase().contains(".mp4")) + .cloned() + .unwrap_or_else(|| media_urls.first().cloned().unwrap_or_default()); + if selected_url.is_empty() { + return None; + } + + let mut item = VideoItem::new( + raw.id, + raw.title, + selected_url, + "xfree".to_string(), + thumb, + duration, + ) + .formats(formats) + .preview( + media_urls + .first() + .cloned() + .unwrap_or_else(|| raw.detail_url.clone()), + ); + + if let Some(views) = raw.views { + item = item.views(views); + } + if let Some(uploader) = raw.uploader { + item = item.uploader(uploader); + } + if !tags.is_empty() { + item = item.tags(tags); + } + + Some(item) + } + + async fn parse_video_items_from_html( + &self, + html: String, + requester: crate::util::requester::Requester, + ) -> Vec { + let listing_items = self.parse_listing_items(&html); + if listing_items.is_empty() { + return vec![]; + } + + let mut in_flight = FuturesUnordered::new(); + let mut items = vec![]; + let mut iter = listing_items.into_iter(); + const MAX_IN_FLIGHT: usize = 5; + + loop { + while in_flight.len() < MAX_IN_FLIGHT { + let Some(raw) = iter.next() else { + break; + }; + in_flight.push(self.fetch_detailed_video_item(raw, requester.clone())); + } + + let Some(result) = in_flight.next().await else { + break; + }; + if let Some(item) = result { + items.push(item); + } + } + + items + } + + async fn fetch( + &self, + cache: VideoCache, + page: u8, + query: &str, + options: ServerOptions, + ) -> Result> { + let urls = self.build_listing_urls(page, query, &options); + let mut requester = requester_or_default(&options, "xfree", "fetch"); + let mut stale_items = vec![]; + + for url in urls { + if let Some((time, items)) = cache.get(&url) { + if time.elapsed().unwrap_or_default().as_secs() < 300 { + return Ok(items.clone()); + } + if stale_items.is_empty() && !items.is_empty() { + stale_items = items.clone(); + } + } + + let html = match requester.get(&url, None).await { + Ok(text) => text, + Err(e) => { + 
report_provider_error_background( + "xfree", + "listing.request", + &format!("url={url}; error={e}"), + ); + continue; + } + }; + + let items = self + .parse_video_items_from_html(html, requester.clone()) + .await; + if !items.is_empty() { + cache.remove(&url); + cache.insert(url, items.clone()); + return Ok(items); + } + } + + Ok(stale_items) + } +} + +#[async_trait] +impl Provider for XfreeProvider { + async fn get_videos( + &self, + cache: VideoCache, + _pool: DbPool, + _sort: String, + query: Option, + page: String, + _per_page: String, + options: ServerOptions, + ) -> Vec { + let page = page.parse::().unwrap_or(1); + let query = query.unwrap_or_default(); + + match self.fetch(cache, page, &query, options).await { + Ok(v) => v, + Err(e) => { + let mut chain_str = String::new(); + for (i, cause) in e.iter().enumerate() { + let _ = writeln!(chain_str, "{}. {}", i + 1, cause); + } + send_discord_error_report( + e.to_string(), + Some(chain_str), + Some("Xfree Provider"), + Some("Failed to fetch videos"), + file!(), + line!(), + module_path!(), + ) + .await; + vec![] + } + } + } + + fn get_channel(&self, clientversion: ClientVersion) -> Option { + Some(self.build_channel(clientversion)) + } +} + +#[cfg(test)] +mod tests { + use super::XfreeProvider; + + #[test] + fn parses_listing_items_from_html() { + let provider = XfreeProvider::new(); + let html = r#" + + + 1:23 + 12.5K views + + "#; + let items = provider.parse_listing_items(html); + assert_eq!(items.len(), 1); + assert_eq!(items[0].id, "12345"); + assert_eq!(items[0].title, "BBC Anal Test"); + assert_eq!(items[0].duration, 83); + assert_eq!(items[0].views, Some(12_500)); + } + + #[test] + fn extracts_media_urls_from_escaped_html() { + let provider = XfreeProvider::new(); + let html = r#" + + "#; + let urls = provider.extract_media_urls(html); + assert_eq!(urls.len(), 2); + assert!(urls.iter().any(|u| u.contains("clip_720p.mp4"))); + assert!(urls.iter().any(|u| u.contains("master.m3u8"))); + } +}