archivebate still needs work

2026-04-17 21:03:01 +00:00
parent 33ec098aae
commit e6eb85cd5a
1 changed files with 259 additions and 4 deletions
--- a/src/providers/archivebate.rs
+++ b/src/providers/archivebate.rs
@@ -7,10 +7,11 @@ use crate::status::*;
 use crate::util::cache::VideoCache;
 use crate::util::parse_abbreviated_number;
 use crate::util::time::parse_time_to_seconds;
-use crate::videos::{ServerOptions, VideoItem};
+use crate::videos::{ServerOptions, VideoFormat, VideoItem};
 use async_trait::async_trait;
 use chrono::{Duration as ChronoDuration, Utc};
 use error_chain::error_chain;
 use futures::stream::{self, StreamExt};
 use htmlentity::entity::{ICodedDataTrait, decode};
 use percent_encoding::{NON_ALPHANUMERIC, percent_decode_str, utf8_percent_encode};
 use regex::Regex;
@@ -20,6 +21,8 @@ use serde_json::Value;
 use std::collections::HashSet;
 use std::sync::{Arc, RwLock};
 use std::thread;
 use std::time::Duration as StdDuration;
 use tokio::time::timeout;
 pub const CHANNEL_METADATA: crate::providers::ProviderChannelMetadata =
    crate::providers::ProviderChannelMetadata {
@@ -281,6 +284,9 @@ impl ArchivebateProvider {
        if value.starts_with("http://") || value.starts_with("https://") {
            return value.to_string();
        }
        if value.starts_with("//") {
            return format!("https:{value}");
        }
        format!(
            "{}/{}",
            self.url.trim_end_matches('/'),
@@ -711,6 +717,213 @@ impl ArchivebateProvider {
        Ok(items)
    }
    /// Unpacks a Dean-Edwards style `eval(function(p,a,c,k,e,d){...})` script
    /// found in `html` and returns the decoded JavaScript source.
    ///
    /// The packed call carries a payload string, a radix, a token count and a
    /// `|`-separated token list. Every word in the payload that is the
    /// radix-encoded index of a non-empty token is replaced by that token.
    ///
    /// Returns `None` when no packed script is found, when the numeric
    /// arguments do not parse, or when the radix is outside `2..=36` (the
    /// range `to_radix` can encode).
    fn parse_mixin_packed_eval(html: &str) -> Option<String> {
        let eval_regex = Regex::new(
            r#"(?s)eval\(function\(p,a,c,k,e,d\)\{.*?\}\('(?P<payload>.*?)',\s*(?P<radix>[0-9]+),\s*(?P<count>[0-9]+),\s*'(?P<tokens>.*?)'\.split\('\|'\)"#,
        )
        .ok()?;
        let captures = eval_regex.captures(html)?;
        let payload_raw = captures.name("payload")?.as_str();
        let radix = captures.name("radix")?.as_str().parse::<u32>().ok()?;
        let count = captures.name("count")?.as_str().parse::<usize>().ok()?;
        if !(2..=36).contains(&radix) {
            return None;
        }
        let payload = Self::unescape_js_single_quoted(payload_raw);
        let tokens_raw = captures.name("tokens")?.as_str();

        // Build the key -> token dictionary once. `take(count)` bounds the work
        // by the real token list, so an absurd `count` taken from the page
        // cannot turn into a huge loop. Empty tokens mean "keep the key as is".
        let dictionary: std::collections::HashMap<String, &str> = tokens_raw
            .split('|')
            .take(count)
            .enumerate()
            .filter(|(_, token)| !token.is_empty())
            .map(|(index, token)| (Self::to_radix(index, radix), token))
            .collect();

        // Substitute every word in a single pass. Replacing key by key would
        // let an already inserted token be rewritten again whenever it happens
        // to equal a lower-index key, and would compile one regex per token.
        let word_regex = Regex::new(r"\b\w+\b").ok()?;
        let unpacked = word_regex.replace_all(&payload, |found: &regex::Captures| {
            let word = &found[0];
            dictionary.get(word).copied().unwrap_or(word).to_string()
        });
        Some(unpacked.into_owned())
    }
    /// Decodes the backslash escapes of a JavaScript string literal body.
    ///
    /// `\n`, `\r` and `\t` become the matching control characters; any other
    /// escaped character (including `\\`, `\'` and `\"`) is emitted as itself.
    /// A lone backslash at the very end of the input is dropped.
    fn unescape_js_single_quoted(value: &str) -> String {
        let mut decoded = String::with_capacity(value.len());
        let mut pending = value.chars();
        loop {
            match pending.next() {
                None => break,
                Some('\\') => {
                    // A trailing backslash has nothing to escape: stop here.
                    let Some(escaped) = pending.next() else {
                        break;
                    };
                    decoded.push(match escaped {
                        'n' => '\n',
                        'r' => '\r',
                        't' => '\t',
                        literal => literal,
                    });
                }
                Some(plain) => decoded.push(plain),
            }
        }
        decoded
    }
    /// Renders `value` in the given `radix` using lowercase digits (`0-9`
    /// followed by `a-z`).
    ///
    /// The digit table has 36 entries, so callers are expected to pass a radix
    /// between 2 and 36.
    fn to_radix(mut value: usize, radix: u32) -> String {
        const DIGITS: &[u8; 36] = b"0123456789abcdefghijklmnopqrstuvwxyz";
        if value == 0 {
            return String::from("0");
        }
        let base = radix as usize;
        // Digits come out least-significant first; flip them once at the end.
        let mut reversed = String::new();
        while value != 0 {
            reversed.push(char::from(DIGITS[value % base]));
            value /= base;
        }
        reversed.chars().rev().collect()
    }
    /// Finds the `MDCore.wurl = "..."` assignment in a player page and returns
    /// its value as a media URL.
    ///
    /// The page is searched as-is first. If the assignment is not present in
    /// plain form, the packed `eval(...)` script is unpacked and searched with
    /// the same pattern. Protocol-relative values (`//host/...`) are returned
    /// with an `https:` scheme.
    ///
    /// Returns `None` when neither the page nor its unpacked script contains
    /// the assignment.
    fn extract_mixdrop_media_url(html: &str) -> Option<String> {
        // Compile the pattern once and reuse it for both the raw page and the
        // unpacked script.
        let wurl_regex = Regex::new(r#"MDCore\.wurl\s*=\s*"([^"]+)""#).ok()?;
        let find_wurl = |haystack: &str| {
            wurl_regex
                .captures(haystack)
                .and_then(|captures| captures.get(1))
                .map(|value| Self::normalize_possible_protocol_relative(value.as_str()))
        };
        if let Some(url) = find_wurl(html) {
            return Some(url);
        }
        let unpacked = Self::parse_mixin_packed_eval(html)?;
        find_wurl(&unpacked)
    }
    /// Trims surrounding whitespace from `value` and, when the result is a
    /// protocol-relative URL (`//host/...`), prefixes it with `https:`.
    /// Anything else is returned trimmed but otherwise unchanged.
    fn normalize_possible_protocol_relative(value: &str) -> String {
        let candidate = value.trim();
        match candidate.strip_prefix("//") {
            Some(remainder) => format!("https://{remainder}"),
            None => candidate.to_owned(),
        }
    }
    /// Parses `url` and returns its host in ASCII lowercase, or `None` when the
    /// string does not parse as a URL or has no host component.
    fn host_from_url(url: &str) -> Option<String> {
        url::Url::parse(url)
            .ok()
            .and_then(|parsed| parsed.host_str().map(str::to_ascii_lowercase))
    }
    /// Reports whether the host of `url` contains `mixdrop` or `m1xdrop`.
    /// URLs without a parsable host yield `false`.
    fn is_mixdrop_host(url: &str) -> bool {
        match Self::host_from_url(url) {
            Some(host) => ["mixdrop", "m1xdrop"]
                .iter()
                .any(|marker| host.contains(marker)),
            None => false,
        }
    }
    fn first_video_source_from_html(html: &str) -> Option<String> {
        let document = Html::parse_document(html);
        let source_selector = Selector::parse("video source[src]").ok()?;
        let video_src_selector = Selector::parse("video[src]").ok()?;
        if let Some(value) = document
            .select(&source_selector)
            .next()
            .and_then(|node| node.value().attr("src"))
        {
            return Some(value.to_string());
        }
        document
            .select(&video_src_selector)
            .next()
            .and_then(|node| node.value().attr("src"))
            .map(|value| value.to_string())
    }
    /// Returns the `src` attribute of the first `<iframe src=...>` in `html`,
    /// or `None` when the document contains no such element.
    fn first_iframe_source_from_html(html: &str) -> Option<String> {
        let iframe_selector = Selector::parse("iframe[src]").ok()?;
        let document = Html::parse_document(html);
        let first_iframe = document.select(&iframe_selector).next()?;
        first_iframe.value().attr("src").map(String::from)
    }
    /// Downloads the embed page at `iframe_url` (sending `referer` in the HTML
    /// headers, over HTTP/1.1) and extracts the media URL from it.
    ///
    /// Returns `None` when the request fails or the page does not expose a
    /// media URL.
    async fn resolve_mixdrop_media_from_iframe(
        &self,
        iframe_url: &str,
        referer: &str,
        options: &ServerOptions,
    ) -> Option<String> {
        let mut requester = requester_or_default(options, CHANNEL_ID, "resolve_mixdrop_media");
        let headers = self.html_headers(referer);
        let response = requester
            .get_with_headers(iframe_url, headers, Some(wreq::Version::HTTP_11))
            .await;
        match response {
            Ok(body) => Self::extract_mixdrop_media_url(&body),
            // Request failures are not reported here; the caller just gets no URL.
            Err(_) => None,
        }
    }
    /// Fetches the detail page of `item` and, when a playable source can be
    /// found, attaches it as the item's single `"source"` / `"mp4"` format.
    ///
    /// Lookup order: a `<video>` source on the detail page first, then the
    /// first iframe if (and only if) it points at a mixdrop host. When the
    /// page cannot be fetched the error is reported in the background and the
    /// item is returned untouched; the item is also returned untouched when no
    /// media URL is found.
    async fn enrich_video(&self, item: VideoItem, options: &ServerOptions) -> VideoItem {
        let page_url = item.url.clone();
        let mut requester = requester_or_default(options, CHANNEL_ID, "archivebate.enrich_video");
        let fetched = requester
            .get_with_headers(
                &page_url,
                self.html_headers(&format!("{}/", self.url)),
                Some(wreq::Version::HTTP_11),
            )
            .await;
        let detail_html = match fetched {
            Ok(body) => body,
            Err(error) => {
                report_provider_error_background(
                    CHANNEL_ID,
                    "enrich_video.fetch_detail",
                    &format!("url={page_url}; error={error}"),
                );
                return item;
            }
        };

        let direct_source = Self::first_video_source_from_html(&detail_html)
            .map(|source| self.absolute_url(&source));
        let media_url = match direct_source {
            Some(url) => Some(url),
            None => {
                // No inline player: try the embedded iframe, mixdrop hosts only.
                let iframe_url = Self::first_iframe_source_from_html(&detail_html)
                    .map(|source| self.absolute_url(&source));
                match iframe_url {
                    Some(iframe_url) if Self::is_mixdrop_host(&iframe_url) => {
                        self.resolve_mixdrop_media_from_iframe(&iframe_url, &page_url, options)
                            .await
                    }
                    _ => None,
                }
            }
        };

        match media_url {
            Some(media_url) => {
                let format =
                    VideoFormat::new(media_url, "source".to_string(), "mp4".to_string());
                let mut enriched = item;
                enriched.formats = Some(vec![format]);
                enriched
            }
            None => item,
        }
    }
    fn extract_csrf_token(html: &str) -> Option<String> {
        let regex = Regex::new(r#"<meta name="csrf-token" content="([^"]+)""#).ok()?;
        regex
@@ -1047,13 +1260,35 @@ impl Provider for ArchivebateProvider {
        let result = match query {
            Some(query) if !query.trim().is_empty() => {
-                self.query(cache, page, per_page, &query, options).await
+                self.query(cache, page, per_page, &query, options.clone()).await
            }
-            _ => self.get_default(cache, page, per_page, options).await,
+            _ => self.get_default(cache, page, per_page, options.clone()).await,
        };
        match result {
-            Ok(videos) => videos,
+            Ok(videos) => {
                if videos.is_empty() {
                    return videos;
                }
                stream::iter(videos.into_iter().map(|video| {
                    let provider = self.clone();
                    let options = options.clone();
                    async move {
                        let timeout_result = timeout(
                            StdDuration::from_secs(8),
                            provider.enrich_video(video.clone(), &options),
                        )
                        .await;
                        match timeout_result {
                            Ok(enriched) => enriched,
                            Err(_) => video,
                        }
                    }
                }))
                .buffer_unordered(4)
                .collect::<Vec<_>>()
                .await
            }
            Err(error) => {
                report_provider_error(CHANNEL_ID, "get_videos", &error.to_string()).await;
                vec![]
@@ -1065,3 +1300,23 @@ impl Provider for ArchivebateProvider {
        Some(self.build_channel(clientversion))
    }
 }
 #[cfg(test)]
 mod tests {
    use super::ArchivebateProvider;
    // The fixture below contains no plain `MDCore.wurl = "..."` assignment; the
    // URL only exists inside the packed `eval(...)` call (radix 13, 13 tokens),
    // so extraction has to go through the unpacking fallback.
    #[test]
    fn extracts_mixdrop_wurl_from_packed_eval() {
        let html = r#"
 <script>
 eval(function(p,a,c,k,e,d){e=function(c){return c};if(!''.replace(/^/,String)){while(c--){d[c]=k[c]||c}k=[function(e){return d[e]}];e=function(){return'\\w+'};c=1};while(c--){if(k[c]){p=p.replace(new RegExp('\\b'+e(c)+'\\b','g'),k[c])}}return p}('1.2="//3.4.5/6/7.8?9=a&b=c";',13,13,'|MDCore|wurl|o230m5y6z|mxcontent|net|v2|r6pkwozjber741|mp4|s|TvNTJe3_z_6nKveumEHk8Q|e|1776460168'.split('|'),0,{}))
 </script>
 "#;
        let actual = ArchivebateProvider::extract_mixdrop_media_url(html)
            .expect("expected mixdrop media url");
        // The packed value is protocol-relative (`//...`); the expected result
        // carries the `https:` scheme.
        assert_eq!(
            actual,
            "https://o230m5y6z.mxcontent.net/v2/r6pkwozjber741.mp4?s=TvNTJe3_z_6nKveumEHk8Q&e=1776460168"
        );
    }
 }