archivebate still needs work

This commit is contained in:
Simon
2026-04-17 21:03:01 +00:00
parent 33ec098aae
commit e6eb85cd5a

View File

@@ -7,10 +7,11 @@ use crate::status::*;
use crate::util::cache::VideoCache; use crate::util::cache::VideoCache;
use crate::util::parse_abbreviated_number; use crate::util::parse_abbreviated_number;
use crate::util::time::parse_time_to_seconds; use crate::util::time::parse_time_to_seconds;
use crate::videos::{ServerOptions, VideoItem}; use crate::videos::{ServerOptions, VideoFormat, VideoItem};
use async_trait::async_trait; use async_trait::async_trait;
use chrono::{Duration as ChronoDuration, Utc}; use chrono::{Duration as ChronoDuration, Utc};
use error_chain::error_chain; use error_chain::error_chain;
use futures::stream::{self, StreamExt};
use htmlentity::entity::{ICodedDataTrait, decode}; use htmlentity::entity::{ICodedDataTrait, decode};
use percent_encoding::{NON_ALPHANUMERIC, percent_decode_str, utf8_percent_encode}; use percent_encoding::{NON_ALPHANUMERIC, percent_decode_str, utf8_percent_encode};
use regex::Regex; use regex::Regex;
@@ -20,6 +21,8 @@ use serde_json::Value;
use std::collections::HashSet; use std::collections::HashSet;
use std::sync::{Arc, RwLock}; use std::sync::{Arc, RwLock};
use std::thread; use std::thread;
use std::time::Duration as StdDuration;
use tokio::time::timeout;
pub const CHANNEL_METADATA: crate::providers::ProviderChannelMetadata = pub const CHANNEL_METADATA: crate::providers::ProviderChannelMetadata =
crate::providers::ProviderChannelMetadata { crate::providers::ProviderChannelMetadata {
@@ -281,6 +284,9 @@ impl ArchivebateProvider {
if value.starts_with("http://") || value.starts_with("https://") { if value.starts_with("http://") || value.starts_with("https://") {
return value.to_string(); return value.to_string();
} }
if value.starts_with("//") {
return format!("https:{value}");
}
format!( format!(
"{}/{}", "{}/{}",
self.url.trim_end_matches('/'), self.url.trim_end_matches('/'),
@@ -711,6 +717,213 @@ impl ArchivebateProvider {
Ok(items) Ok(items)
} }
/// Unpacks the first `eval(function(p,a,c,k,e,d){...}('payload',radix,count,'a|b|c'.split('|')...`
/// script found in `html` and returns the decoded script text.
///
/// The payload is a script in which every identifier has been replaced by its
/// dictionary index written in base `radix`; the `|`-separated token list is
/// the dictionary. An empty dictionary entry means "keep the key as it is".
///
/// Returns `None` when no packed script is present, when the numeric fields
/// do not parse, or when `radix` is outside `2..=36`.
fn parse_mixin_packed_eval(html: &str) -> Option<String> {
    let eval_regex = Regex::new(
        r#"(?s)eval\(function\(p,a,c,k,e,d\)\{.*?\}\('(?P<payload>.*?)',\s*(?P<radix>[0-9]+),\s*(?P<count>[0-9]+),\s*'(?P<tokens>.*?)'\.split\('\|'\)"#,
    )
    .ok()?;
    let captures = eval_regex.captures(html)?;

    let radix = captures.name("radix")?.as_str().parse::<u32>().ok()?;
    let count = captures.name("count")?.as_str().parse::<usize>().ok()?;
    if !(2..=36).contains(&radix) {
        return None;
    }

    let payload = Self::unescape_js_single_quoted(captures.name("payload")?.as_str());
    let tokens = captures
        .name("tokens")?
        .as_str()
        .split('|')
        .collect::<Vec<_>>();

    // Build key -> token once. `take(count)` bounds the work by the number of
    // tokens actually present, so an absurd `count` taken from the page cannot
    // make this loop spin.
    let dictionary: std::collections::HashMap<String, &str> = tokens
        .iter()
        .take(count)
        .enumerate()
        .filter(|(_, token)| !token.is_empty())
        .map(|(index, token)| (Self::to_radix(index, radix), *token))
        .collect();

    // Substitute every ASCII word in a single pass. Compared with replacing one
    // key at a time this compiles one regex instead of one per token, never
    // re-substitutes text that was itself just inserted, and inserts tokens
    // literally (a `$` inside a token is not treated as a capture reference).
    let word_regex = Regex::new(r"[0-9A-Za-z_]+").ok()?;
    let unpacked = word_regex.replace_all(&payload, |found: &regex::Captures<'_>| {
        let word = &found[0];
        dictionary.get(word).copied().unwrap_or(word).to_string()
    });
    Some(unpacked.into_owned())
}
/// Resolves backslash escapes in the body of a JavaScript string literal.
///
/// `\n`, `\r` and `\t` become the matching control characters; any other
/// escaped character (including `\\`, `\'` and `\"`) is emitted as itself.
/// A lone backslash at the very end of the input is dropped.
fn unescape_js_single_quoted(value: &str) -> String {
    let mut decoded = String::with_capacity(value.len());
    // True while the previous character was an unconsumed backslash.
    let mut pending_escape = false;
    for symbol in value.chars() {
        if pending_escape {
            pending_escape = false;
            decoded.push(match symbol {
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                literal => literal,
            });
        } else if symbol == '\\' {
            pending_escape = true;
        } else {
            decoded.push(symbol);
        }
    }
    decoded
}
/// Formats `value` in base `radix` using the digits `0-9` followed by `a-z`.
///
/// Zero is always rendered as `"0"`. The digit table has 36 entries, so
/// callers are expected to pass a radix in `2..=36`.
fn to_radix(mut value: usize, radix: u32) -> String {
    const DIGITS: &[u8] = b"0123456789abcdefghijklmnopqrstuvwxyz";
    if value == 0 {
        return String::from("0");
    }
    let base = radix as usize;
    // Digits come out least-significant first; flip them at the end.
    let mut least_significant_first = String::new();
    while value != 0 {
        least_significant_first.push(char::from(DIGITS[value % base]));
        value /= base;
    }
    least_significant_first.chars().rev().collect()
}
/// Finds the `MDCore.wurl = "..."` assignment in a MixDrop embed page and
/// returns its value, with a leading `//` expanded to `https://`.
///
/// The raw HTML is searched first; if the assignment is not there, the packed
/// `eval(...)` script is unpacked and searched instead. Returns `None` when
/// neither contains the assignment.
fn extract_mixdrop_media_url(html: &str) -> Option<String> {
    // One compiled pattern serves both the raw and the unpacked lookup.
    let wurl_regex = Regex::new(r#"MDCore\.wurl\s*=\s*"([^"]+)""#).ok()?;
    let find_wurl = |haystack: &str| -> Option<String> {
        wurl_regex
            .captures(haystack)
            .and_then(|captures| captures.get(1))
            .map(|found| Self::normalize_possible_protocol_relative(found.as_str()))
    };

    if let Some(url) = find_wurl(html) {
        return Some(url);
    }
    let unpacked = Self::parse_mixin_packed_eval(html)?;
    find_wurl(&unpacked)
}
/// Trims surrounding whitespace and, when the result starts with `//`,
/// prefixes it with `https:`. Any other value is returned trimmed but
/// otherwise untouched.
fn normalize_possible_protocol_relative(value: &str) -> String {
    let candidate = value.trim();
    match candidate.strip_prefix("//") {
        Some(remainder) => format!("https://{remainder}"),
        None => candidate.to_owned(),
    }
}
/// Parses `url` and returns its host component in ASCII lowercase.
///
/// Returns `None` if the string is not a parseable URL or has no host.
fn host_from_url(url: &str) -> Option<String> {
    url::Url::parse(url)
        .ok()
        .and_then(|parsed| parsed.host_str().map(str::to_ascii_lowercase))
}
/// Reports whether the host of `url` contains `mixdrop` or `m1xdrop`.
///
/// This is a substring check on the lowercased host; a URL that cannot be
/// parsed or has no host yields `false`.
fn is_mixdrop_host(url: &str) -> bool {
    match Self::host_from_url(url) {
        Some(host) => host.contains("mixdrop") || host.contains("m1xdrop"),
        None => false,
    }
}
fn first_video_source_from_html(html: &str) -> Option<String> {
let document = Html::parse_document(html);
let source_selector = Selector::parse("video source[src]").ok()?;
let video_src_selector = Selector::parse("video[src]").ok()?;
if let Some(value) = document
.select(&source_selector)
.next()
.and_then(|node| node.value().attr("src"))
{
return Some(value.to_string());
}
document
.select(&video_src_selector)
.next()
.and_then(|node| node.value().attr("src"))
.map(|value| value.to_string())
}
/// Returns the `src` attribute of the first `<iframe src>` element in `html`,
/// or `None` when the document contains no such element.
fn first_iframe_source_from_html(html: &str) -> Option<String> {
    let selector = Selector::parse("iframe[src]").ok()?;
    let document = Html::parse_document(html);
    let first_iframe = document.select(&selector).next()?;
    first_iframe.value().attr("src").map(str::to_owned)
}
async fn resolve_mixdrop_media_from_iframe(
&self,
iframe_url: &str,
referer: &str,
options: &ServerOptions,
) -> Option<String> {
let mut requester = requester_or_default(options, CHANNEL_ID, "resolve_mixdrop_media");
let iframe_html = requester
.get_with_headers(
iframe_url,
self.html_headers(referer),
Some(wreq::Version::HTTP_11),
)
.await
.ok()?;
Self::extract_mixdrop_media_url(&iframe_html)
}
/// Fetches the detail page for `item` and, when a media URL can be resolved,
/// returns the item with `formats` set to a single `"source"` / `"mp4"` entry.
///
/// The media URL is taken from the page's `<video>` markup if present;
/// otherwise the first iframe is followed, but only when it points at a
/// MixDrop host. If the page cannot be fetched (which is reported in the
/// background) or no media URL is found, `item` is returned unchanged.
async fn enrich_video(&self, item: VideoItem, options: &ServerOptions) -> VideoItem {
    let page_url = item.url.clone();
    let mut requester = requester_or_default(options, CHANNEL_ID, "archivebate.enrich_video");
    let fetched = requester
        .get_with_headers(
            &page_url,
            self.html_headers(&format!("{}/", self.url)),
            Some(wreq::Version::HTTP_11),
        )
        .await;
    let detail_html = match fetched {
        Ok(body) => body,
        Err(error) => {
            report_provider_error_background(
                CHANNEL_ID,
                "enrich_video.fetch_detail",
                &format!("url={page_url}; error={error}"),
            );
            return item;
        }
    };

    let media_url = match Self::first_video_source_from_html(&detail_html) {
        // The page embeds the video directly.
        Some(source) => self.absolute_url(&source),
        // Otherwise fall back to a MixDrop iframe, if there is one.
        None => {
            let Some(embed_src) = Self::first_iframe_source_from_html(&detail_html) else {
                return item;
            };
            let embed_url = self.absolute_url(&embed_src);
            if !Self::is_mixdrop_host(&embed_url) {
                return item;
            }
            match self
                .resolve_mixdrop_media_from_iframe(&embed_url, &page_url, options)
                .await
            {
                Some(resolved) => resolved,
                None => return item,
            }
        }
    };

    let mut enriched = item;
    enriched.formats = Some(vec![VideoFormat::new(
        media_url,
        "source".to_string(),
        "mp4".to_string(),
    )]);
    enriched
}
fn extract_csrf_token(html: &str) -> Option<String> { fn extract_csrf_token(html: &str) -> Option<String> {
let regex = Regex::new(r#"<meta name="csrf-token" content="([^"]+)""#).ok()?; let regex = Regex::new(r#"<meta name="csrf-token" content="([^"]+)""#).ok()?;
regex regex
@@ -1047,13 +1260,35 @@ impl Provider for ArchivebateProvider {
let result = match query { let result = match query {
Some(query) if !query.trim().is_empty() => { Some(query) if !query.trim().is_empty() => {
self.query(cache, page, per_page, &query, options).await self.query(cache, page, per_page, &query, options.clone()).await
} }
_ => self.get_default(cache, page, per_page, options).await, _ => self.get_default(cache, page, per_page, options.clone()).await,
}; };
match result { match result {
Ok(videos) => videos, Ok(videos) => {
if videos.is_empty() {
return videos;
}
stream::iter(videos.into_iter().map(|video| {
let provider = self.clone();
let options = options.clone();
async move {
let timeout_result = timeout(
StdDuration::from_secs(8),
provider.enrich_video(video.clone(), &options),
)
.await;
match timeout_result {
Ok(enriched) => enriched,
Err(_) => video,
}
}
}))
.buffer_unordered(4)
.collect::<Vec<_>>()
.await
}
Err(error) => { Err(error) => {
report_provider_error(CHANNEL_ID, "get_videos", &error.to_string()).await; report_provider_error(CHANNEL_ID, "get_videos", &error.to_string()).await;
vec![] vec![]
@@ -1065,3 +1300,23 @@ impl Provider for ArchivebateProvider {
Some(self.build_channel(clientversion)) Some(self.build_channel(clientversion))
} }
} }
#[cfg(test)]
mod tests {
    use super::ArchivebateProvider;

    /// A packed `eval(...)` script whose payload assigns `MDCore.wurl` must
    /// unpack to the full `https://` media URL.
    #[test]
    fn extracts_mixdrop_wurl_from_packed_eval() {
        const EXPECTED_URL: &str = "https://o230m5y6z.mxcontent.net/v2/r6pkwozjber741.mp4?s=TvNTJe3_z_6nKveumEHk8Q&e=1776460168";
        let html = r#"
<script>
eval(function(p,a,c,k,e,d){e=function(c){return c};if(!''.replace(/^/,String)){while(c--){d[c]=k[c]||c}k=[function(e){return d[e]}];e=function(){return'\\w+'};c=1};while(c--){if(k[c]){p=p.replace(new RegExp('\\b'+e(c)+'\\b','g'),k[c])}}return p}('1.2="//3.4.5/6/7.8?9=a&b=c";',13,13,'|MDCore|wurl|o230m5y6z|mxcontent|net|v2|r6pkwozjber741|mp4|s|TvNTJe3_z_6nKveumEHk8Q|e|1776460168'.split('|'),0,{}))
</script>
"#;
        let resolved = ArchivebateProvider::extract_mixdrop_media_url(html)
            .expect("expected mixdrop media url");
        assert_eq!(resolved, EXPECTED_URL);
    }
}