Compare commits

..

2 commits

Author SHA1 Message Date
e5c51fe995 fix: extract visitor data from html page
Some checks failed
ci/woodpecker/push/woodpecker Pipeline failed
2023-08-03 19:31:34 +02:00
ed84f72ace fix: hold back regex crate (v1.9.0 causes issues)
Reported issue: https://github.com/rust-lang/regex/issues/1060
2023-08-03 18:29:33 +02:00
3 changed files with 45 additions and 25 deletions

View file

@ -31,7 +31,7 @@ quick-js-dtp = { version = "0.4.1", default-features = false, features = [
"patch-dateparser", "patch-dateparser",
] } ] }
once_cell = "1.12.0" once_cell = "1.12.0"
regex = "1.6.0" regex = ">=1.6.0, <1.9.0"
fancy-regex = "0.11.0" fancy-regex = "0.11.0"
thiserror = "1.0.36" thiserror = "1.0.36"
url = "2.2.2" url = "2.2.2"

View file

@ -209,8 +209,10 @@ const ANDROID_API_KEY: &str = "AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w";
const IOS_API_KEY: &str = "AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc"; const IOS_API_KEY: &str = "AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc";
const IOS_DEVICE_MODEL: &str = "iPhone14,5"; const IOS_DEVICE_MODEL: &str = "iPhone14,5";
static CLIENT_VERSION_REGEXES: Lazy<[Regex; 1]> = static CLIENT_VERSION_REGEX: Lazy<Regex> =
Lazy::new(|| [Regex::new(r#"INNERTUBE_CONTEXT_CLIENT_VERSION":"([\w\d\._-]+?)""#).unwrap()]); Lazy::new(|| Regex::new(r#""INNERTUBE_CONTEXT_CLIENT_VERSION":"([\w\d\._-]+?)""#).unwrap());
static VISITOR_DATA_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r#""visitorData":"([\w\d_\-%]+?)""#).unwrap());
/// The RustyPipe client used to access YouTube's API /// The RustyPipe client used to access YouTube's API
/// ///
@ -814,11 +816,11 @@ impl RustyPipe {
) )
.await?; .await?;
util::get_cg_from_regexes(CLIENT_VERSION_REGEXES.iter(), &swjs, 1).ok_or( util::get_cg_from_regex(&CLIENT_VERSION_REGEX, &swjs, 1).ok_or(Error::Extraction(
Error::Extraction(ExtractionError::InvalidData(Cow::Borrowed( ExtractionError::InvalidData(Cow::Borrowed(
"Could not find client version in sw.js", "Could not find client version in sw.js",
))), )),
) ))
}); });
let from_html = async { let from_html = async {
@ -829,11 +831,11 @@ impl RustyPipe {
let html = self.http_request_txt(&builder.build().unwrap()).await?; let html = self.http_request_txt(&builder.build().unwrap()).await?;
util::get_cg_from_regexes(CLIENT_VERSION_REGEXES.iter(), &html, 1).ok_or( util::get_cg_from_regex(&CLIENT_VERSION_REGEX, &html, 1).ok_or(Error::Extraction(
Error::Extraction(ExtractionError::InvalidData(Cow::Borrowed( ExtractionError::InvalidData(Cow::Borrowed(
"Could not find client version on html page", "Could not find client version on html page",
))), )),
) ))
}; };
if let Some(from_swjs) = from_swjs { if let Some(from_swjs) = from_swjs {
@ -965,11 +967,15 @@ impl RustyPipe {
/// ///
/// Since the cookie is shared between YT and YTM and the YTM page loads faster, /// Since the cookie is shared between YT and YTM and the YTM page loads faster,
/// we request that. /// we request that.
///
/// Sometimes YouTube does not set the `__Secure-YEC` cookie. In this case, the
/// visitor data is extracted from the html page.
async fn get_visitor_data(&self) -> Result<String, Error> { async fn get_visitor_data(&self) -> Result<String, Error> {
log::debug!("getting YT visitor data"); log::debug!("getting YT visitor data");
let resp = self.inner.http.get(YOUTUBE_MUSIC_HOME_URL).send().await?; let resp = self.inner.http.get(YOUTUBE_MUSIC_HOME_URL).send().await?;
resp.headers() let vdata = resp
.headers()
.get_all(header::SET_COOKIE) .get_all(header::SET_COOKIE)
.iter() .iter()
.find_map(|c| { .find_map(|c| {
@ -979,10 +985,27 @@ impl RustyPipe {
} }
} }
None None
}) });
.ok_or(Error::Extraction(ExtractionError::InvalidData(
Cow::Borrowed("could not get YTM cookies"), match vdata {
))) Some(vdata) => Ok(vdata),
None => {
if resp.status().is_success() {
// Extract visitor data from html
let html = resp.text().await?;
util::get_cg_from_regex(&VISITOR_DATA_REGEX, &html, 1).ok_or(Error::Extraction(
ExtractionError::InvalidData(Cow::Borrowed(
"Could not find visitor data on html page",
)),
))
} else {
Err(Error::Extraction(ExtractionError::InvalidData(
format!("Could not get visitor data, status: {}", resp.status()).into(),
)))
}
}
}
} }
} }

View file

@ -41,14 +41,11 @@ pub const ARTIST_DISCOGRAPHY_PREFIX: &str = "MPAD";
const CONTENT_PLAYBACK_NONCE_ALPHABET: &[u8; 64] = const CONTENT_PLAYBACK_NONCE_ALPHABET: &[u8; 64] =
b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
/// Return the given capture group that matches first in a list of regexes /// Return the given capture group that matches the regex
pub fn get_cg_from_regexes<'a, I>(mut regexes: I, text: &str, cg: usize) -> Option<String> pub fn get_cg_from_regex(regex: &Regex, text: &str, cg: usize) -> Option<String> {
where regex
I: Iterator<Item = &'a Regex>, .captures(text)
{ .and_then(|c| c.get(cg).map(|c| c.as_str().to_owned()))
regexes
.find_map(|pattern| pattern.captures(text))
.map(|c| c.get(cg).unwrap().as_str().to_owned())
} }
/// Return the given capture group that matches first in a list of fancy regexes /// Return the given capture group that matches first in a list of fancy regexes
@ -58,7 +55,7 @@ where
{ {
regexes regexes
.find_map(|pattern| pattern.captures(text).ok().flatten()) .find_map(|pattern| pattern.captures(text).ok().flatten())
.map(|c| c.get(cg).unwrap().as_str().to_owned()) .and_then(|c| c.get(cg).map(|c| c.as_str().to_owned()))
} }
/// Generate a random string with given length and byte charset. /// Generate a random string with given length and byte charset.