From e6eb85cd5aa162e9ac98d1e0eafd2224d0a0f72a Mon Sep 17 00:00:00 2001 From: Simon Date: Fri, 17 Apr 2026 21:03:01 +0000 Subject: [PATCH] archivebate still needs work --- src/providers/archivebate.rs | 263 ++++++++++++++++++++++++++++++++++- 1 file changed, 259 insertions(+), 4 deletions(-) diff --git a/src/providers/archivebate.rs b/src/providers/archivebate.rs index 6525fea..6996232 100644 --- a/src/providers/archivebate.rs +++ b/src/providers/archivebate.rs @@ -7,10 +7,11 @@ use crate::status::*; use crate::util::cache::VideoCache; use crate::util::parse_abbreviated_number; use crate::util::time::parse_time_to_seconds; -use crate::videos::{ServerOptions, VideoItem}; +use crate::videos::{ServerOptions, VideoFormat, VideoItem}; use async_trait::async_trait; use chrono::{Duration as ChronoDuration, Utc}; use error_chain::error_chain; +use futures::stream::{self, StreamExt}; use htmlentity::entity::{ICodedDataTrait, decode}; use percent_encoding::{NON_ALPHANUMERIC, percent_decode_str, utf8_percent_encode}; use regex::Regex; @@ -20,6 +21,8 @@ use serde_json::Value; use std::collections::HashSet; use std::sync::{Arc, RwLock}; use std::thread; +use std::time::Duration as StdDuration; +use tokio::time::timeout; pub const CHANNEL_METADATA: crate::providers::ProviderChannelMetadata = crate::providers::ProviderChannelMetadata { @@ -281,6 +284,9 @@ impl ArchivebateProvider { if value.starts_with("http://") || value.starts_with("https://") { return value.to_string(); } + if value.starts_with("//") { + return format!("https:{value}"); + } format!( "{}/{}", self.url.trim_end_matches('/'), @@ -711,6 +717,213 @@ impl ArchivebateProvider { Ok(items) } + fn parse_mixin_packed_eval(html: &str) -> Option { + let eval_regex = Regex::new( + r#"(?s)eval\(function\(p,a,c,k,e,d\)\{.*?\}\('(?P.*?)',\s*(?P[0-9]+),\s*(?P[0-9]+),\s*'(?P.*?)'\.split\('\|'\)"#, + ) + .ok()?; + let captures = eval_regex.captures(html)?; + let payload_raw = captures.name("payload")?.as_str(); + let radix = captures.name("radix")?.as_str().parse::().ok()?; + let count = captures.name("count")?.as_str().parse::().ok()?; + if !(2..=36).contains(&radix) { + return None; + } + + let payload = Self::unescape_js_single_quoted(payload_raw); + let tokens_raw = captures.name("tokens")?.as_str(); + let tokens = tokens_raw.split('|').collect::>(); + let mut unpacked = payload; + + for index in (0..count).rev() { + let Some(token) = tokens.get(index) else { + continue; + }; + if token.is_empty() { + continue; + } + let key = Self::to_radix(index, radix); + let pattern = format!(r"\b{}\b", regex::escape(&key)); + let re = Regex::new(&pattern).ok()?; + unpacked = re.replace_all(&unpacked, *token).into_owned(); + } + + Some(unpacked) + } + + fn unescape_js_single_quoted(value: &str) -> String { + let mut output = String::with_capacity(value.len()); + let mut chars = value.chars(); + while let Some(character) = chars.next() { + if character != '\\' { + output.push(character); + continue; + } + let Some(next) = chars.next() else { + break; + }; + match next { + '\\' => output.push('\\'), + '\'' => output.push('\''), + '"' => output.push('"'), + 'n' => output.push('\n'), + 'r' => output.push('\r'), + 't' => output.push('\t'), + _ => output.push(next), + } + } + output + } + + fn to_radix(mut value: usize, radix: u32) -> String { + if value == 0 { + return "0".to_string(); + } + let alphabet = b"0123456789abcdefghijklmnopqrstuvwxyz"; + let mut out = Vec::new(); + while value > 0 { + let digit = value % radix as usize; + out.push(alphabet[digit] as char); + value /= radix as usize; + } + out.iter().rev().collect() + } + + fn extract_mixdrop_media_url(html: &str) -> Option { + let direct_regex = Regex::new(r#"MDCore\.wurl\s*=\s*"([^"]+)""#).ok()?; + if let Some(url) = direct_regex + .captures(html) + .and_then(|captures| captures.get(1).map(|value| value.as_str().to_string())) + { + return Some(Self::normalize_possible_protocol_relative(&url)); + } + + let unpacked = Self::parse_mixin_packed_eval(html)?; + let unpacked_regex = Regex::new(r#"MDCore\.wurl\s*=\s*"([^"]+)""#).ok()?; + unpacked_regex + .captures(&unpacked) + .and_then(|captures| captures.get(1).map(|value| value.as_str().to_string())) + .map(|value| Self::normalize_possible_protocol_relative(&value)) + } + + fn normalize_possible_protocol_relative(value: &str) -> String { + let trimmed = value.trim(); + if trimmed.starts_with("//") { + format!("https:{trimmed}") + } else { + trimmed.to_string() + } + } + + fn host_from_url(url: &str) -> Option { + let parsed = url::Url::parse(url).ok()?; + parsed.host_str().map(|value| value.to_ascii_lowercase()) + } + + fn is_mixdrop_host(url: &str) -> bool { + let Some(host) = Self::host_from_url(url) else { + return false; + }; + host.contains("mixdrop") || host.contains("m1xdrop") + } + + fn first_video_source_from_html(html: &str) -> Option { + let document = Html::parse_document(html); + let source_selector = Selector::parse("video source[src]").ok()?; + let video_src_selector = Selector::parse("video[src]").ok()?; + + if let Some(value) = document + .select(&source_selector) + .next() + .and_then(|node| node.value().attr("src")) + { + return Some(value.to_string()); + } + document + .select(&video_src_selector) + .next() + .and_then(|node| node.value().attr("src")) + .map(|value| value.to_string()) + } + + fn first_iframe_source_from_html(html: &str) -> Option { + let document = Html::parse_document(html); + let iframe_selector = Selector::parse("iframe[src]").ok()?; + document + .select(&iframe_selector) + .next() + .and_then(|node| node.value().attr("src")) + .map(|value| value.to_string()) + } + + async fn resolve_mixdrop_media_from_iframe( + &self, + iframe_url: &str, + referer: &str, + options: &ServerOptions, + ) -> Option { + let mut requester = requester_or_default(options, CHANNEL_ID, "resolve_mixdrop_media"); + let iframe_html = requester + .get_with_headers( + iframe_url, + self.html_headers(referer), + Some(wreq::Version::HTTP_11), + ) + .await + .ok()?; + Self::extract_mixdrop_media_url(&iframe_html) + } + + async fn enrich_video(&self, item: VideoItem, options: &ServerOptions) -> VideoItem { + let page_url = item.url.clone(); + let mut requester = requester_or_default(options, CHANNEL_ID, "archivebate.enrich_video"); + let detail_html = match requester + .get_with_headers( + &page_url, + self.html_headers(&format!("{}/", self.url)), + Some(wreq::Version::HTTP_11), + ) + .await + { + Ok(value) => value, + Err(error) => { + report_provider_error_background( + CHANNEL_ID, + "enrich_video.fetch_detail", + &format!("url={page_url}; error={error}"), + ); + return item; + } + }; + + let mut media_url = Self::first_video_source_from_html(&detail_html) + .map(|value| self.absolute_url(&value)); + + if media_url.is_none() { + let iframe_url = Self::first_iframe_source_from_html(&detail_html) + .map(|value| self.absolute_url(&value)); + if let Some(iframe_url) = iframe_url { + if Self::is_mixdrop_host(&iframe_url) { + if let Some(resolved) = self + .resolve_mixdrop_media_from_iframe(&iframe_url, &page_url, options) + .await + { + media_url = Some(resolved); + } + } + } + } + + let Some(media_url) = media_url else { + return item; + }; + + let format = VideoFormat::new(media_url, "source".to_string(), "mp4".to_string()); + let mut enriched = item; + enriched.formats = Some(vec![format]); + enriched + } + fn extract_csrf_token(html: &str) -> Option { let regex = Regex::new(r#" { - self.query(cache, page, per_page, &query, options).await + self.query(cache, page, per_page, &query, options.clone()).await } - _ => self.get_default(cache, page, per_page, options).await, + _ => self.get_default(cache, page, per_page, options.clone()).await, }; match result { - Ok(videos) => videos, + Ok(videos) => { + if videos.is_empty() { + return videos; + } + stream::iter(videos.into_iter().map(|video| { + let provider = self.clone(); + let options = options.clone(); + async move { + let timeout_result = timeout( + StdDuration::from_secs(8), + provider.enrich_video(video.clone(), &options), + ) + .await; + match timeout_result { + Ok(enriched) => enriched, + Err(_) => video, + } + } + })) + .buffer_unordered(4) + .collect::>() + .await + } Err(error) => { report_provider_error(CHANNEL_ID, "get_videos", &error.to_string()).await; vec![] @@ -1065,3 +1300,23 @@ impl Provider for ArchivebateProvider { Some(self.build_channel(clientversion)) } } + +#[cfg(test)] +mod tests { + use super::ArchivebateProvider; + + #[test] + fn extracts_mixdrop_wurl_from_packed_eval() { + let html = r#" + +"#; + let actual = ArchivebateProvider::extract_mixdrop_media_url(html) + .expect("expected mixdrop media url"); + assert_eq!( + actual, + "https://o230m5y6z.mxcontent.net/v2/r6pkwozjber741.mp4?s=TvNTJe3_z_6nKveumEHk8Q&e=1776460168" + ); + } +}