From ed84f72aced948b905e9680962d920b835e3048a Mon Sep 17 00:00:00 2001 From: ThetaDev Date: Thu, 3 Aug 2023 18:29:33 +0200 Subject: [PATCH 1/2] fix: hold back regex crate (v1.9.0 causes issues) Reported issue: https://github.com/rust-lang/regex/issues/1060 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 708866b..e1900b3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,7 @@ quick-js-dtp = { version = "0.4.1", default-features = false, features = [ "patch-dateparser", ] } once_cell = "1.12.0" -regex = "1.6.0" +regex = ">=1.6.0, <1.9.0" fancy-regex = "0.11.0" thiserror = "1.0.36" url = "2.2.2" From e5c51fe99592519bca5500ce4082aad4a0cb8b7d Mon Sep 17 00:00:00 2001 From: ThetaDev Date: Thu, 3 Aug 2023 19:31:34 +0200 Subject: [PATCH 2/2] fix: extract visitor data from html page --- src/client/mod.rs | 53 +++++++++++++++++++++++++++++++++-------------- src/util/mod.rs | 15 ++++++-------- 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/src/client/mod.rs b/src/client/mod.rs index 50c7f53..bf5413e 100644 --- a/src/client/mod.rs +++ b/src/client/mod.rs @@ -209,8 +209,10 @@ const ANDROID_API_KEY: &str = "AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w"; const IOS_API_KEY: &str = "AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc"; const IOS_DEVICE_MODEL: &str = "iPhone14,5"; -static CLIENT_VERSION_REGEXES: Lazy<[Regex; 1]> = - Lazy::new(|| [Regex::new(r#"INNERTUBE_CONTEXT_CLIENT_VERSION":"([\w\d\._-]+?)""#).unwrap()]); +static CLIENT_VERSION_REGEX: Lazy<Regex> = + Lazy::new(|| Regex::new(r#""INNERTUBE_CONTEXT_CLIENT_VERSION":"([\w\d\._-]+?)""#).unwrap()); +static VISITOR_DATA_REGEX: Lazy<Regex> = + Lazy::new(|| Regex::new(r#""visitorData":"([\w\d_\-%]+?)""#).unwrap()); /// The RustyPipe client used to access YouTube's API /// @@ -814,11 +816,11 @@ impl RustyPipe { ) .await?; - util::get_cg_from_regexes(CLIENT_VERSION_REGEXES.iter(), &swjs, 1).ok_or( - Error::Extraction(ExtractionError::InvalidData(Cow::Borrowed( + 
util::get_cg_from_regex(&CLIENT_VERSION_REGEX, &swjs, 1).ok_or(Error::Extraction( + ExtractionError::InvalidData(Cow::Borrowed( "Could not find client version in sw.js", - ))), - ) + )), + )) }); let from_html = async { @@ -829,11 +831,11 @@ impl RustyPipe { let html = self.http_request_txt(&builder.build().unwrap()).await?; - util::get_cg_from_regexes(CLIENT_VERSION_REGEXES.iter(), &html, 1).ok_or( - Error::Extraction(ExtractionError::InvalidData(Cow::Borrowed( + util::get_cg_from_regex(&CLIENT_VERSION_REGEX, &html, 1).ok_or(Error::Extraction( + ExtractionError::InvalidData(Cow::Borrowed( "Could not find client version on html page", - ))), - ) + )), + )) }; if let Some(from_swjs) = from_swjs { @@ -965,11 +967,15 @@ impl RustyPipe { /// /// Since the cookie is shared between YT and YTM and the YTM page loads faster, /// we request that. + /// + /// Sometimes YouTube does not set the `__Secure-YEC` cookie. In this case, the + /// visitor data is extracted from the html page. async fn get_visitor_data(&self) -> Result<String, Error> { log::debug!("getting YT visitor data"); let resp = self.inner.http.get(YOUTUBE_MUSIC_HOME_URL).send().await?; - resp.headers() + let vdata = resp + .headers() .get_all(header::SET_COOKIE) .iter() .find_map(|c| { @@ -979,10 +985,27 @@ impl RustyPipe { } } None - }) - .ok_or(Error::Extraction(ExtractionError::InvalidData( - Cow::Borrowed("could not get YTM cookies"), - ))) + }); + + match vdata { + Some(vdata) => Ok(vdata), + None => { + if resp.status().is_success() { + // Extract visitor data from html + let html = resp.text().await?; + + util::get_cg_from_regex(&VISITOR_DATA_REGEX, &html, 1).ok_or(Error::Extraction( + ExtractionError::InvalidData(Cow::Borrowed( + "Could not find visitor data on html page", + )), + )) + } else { + Err(Error::Extraction(ExtractionError::InvalidData( + format!("Could not get visitor data, status: {}", resp.status()).into(), + ))) + } + } + } } } diff --git a/src/util/mod.rs b/src/util/mod.rs index ad881e3..363d48d 
100644 --- a/src/util/mod.rs +++ b/src/util/mod.rs @@ -41,14 +41,11 @@ pub const ARTIST_DISCOGRAPHY_PREFIX: &str = "MPAD"; const CONTENT_PLAYBACK_NONCE_ALPHABET: &[u8; 64] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; -/// Return the given capture group that matches first in a list of regexes -pub fn get_cg_from_regexes<'a, I>(mut regexes: I, text: &str, cg: usize) -> Option<String> -where - I: Iterator<Item = &'a Regex>, -{ - regexes - .find_map(|pattern| pattern.captures(text)) - .map(|c| c.get(cg).unwrap().as_str().to_owned()) +/// Return the given capture group that matches the regex +pub fn get_cg_from_regex(regex: &Regex, text: &str, cg: usize) -> Option<String> { + regex + .captures(text) + .and_then(|c| c.get(cg).map(|c| c.as_str().to_owned())) } /// Return the given capture group that matches first in a list of fancy regexes @@ -58,7 +55,7 @@ where { regexes .find_map(|pattern| pattern.captures(text).ok().flatten()) - .map(|c| c.get(cg).unwrap().as_str().to_owned()) + .and_then(|c| c.get(cg).map(|c| c.as_str().to_owned())) } /// Generate a random string with given length and byte charset.