Compare commits

..

2 commits

Author SHA1 Message Date
e5c51fe995 fix: extract visitor data from html page
Some checks failed
ci/woodpecker/push/woodpecker Pipeline failed
2023-08-03 19:31:34 +02:00
ed84f72ace fix: hold back regex crate (v1.9.0 causes issues)
Reported issue: https://github.com/rust-lang/regex/issues/1060
2023-08-03 18:29:33 +02:00
3 changed files with 45 additions and 25 deletions

View file

@ -31,7 +31,7 @@ quick-js-dtp = { version = "0.4.1", default-features = false, features = [
"patch-dateparser",
] }
once_cell = "1.12.0"
regex = "1.6.0"
regex = ">=1.6.0, <1.9.0"
fancy-regex = "0.11.0"
thiserror = "1.0.36"
url = "2.2.2"

View file

@ -209,8 +209,10 @@ const ANDROID_API_KEY: &str = "AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w";
const IOS_API_KEY: &str = "AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc";
const IOS_DEVICE_MODEL: &str = "iPhone14,5";
static CLIENT_VERSION_REGEXES: Lazy<[Regex; 1]> =
Lazy::new(|| [Regex::new(r#"INNERTUBE_CONTEXT_CLIENT_VERSION":"([\w\d\._-]+?)""#).unwrap()]);
/// Regex whose capture group 1 is the value of the
/// `"INNERTUBE_CONTEXT_CLIENT_VERSION"` key; it is applied to the fetched
/// `sw.js` text and to the HTML page text when looking up the client version.
static CLIENT_VERSION_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r#""INNERTUBE_CONTEXT_CLIENT_VERSION":"([\w\d\._-]+?)""#).unwrap());
/// Regex whose capture group 1 is the value of the `"visitorData"` key
/// (word characters, `_`, `-` and `%`); it is applied to the HTML response
/// body when no visitor data could be taken from the response cookies.
static VISITOR_DATA_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r#""visitorData":"([\w\d_\-%]+?)""#).unwrap());
/// The RustyPipe client used to access YouTube's API
///
@ -814,11 +816,11 @@ impl RustyPipe {
)
.await?;
util::get_cg_from_regexes(CLIENT_VERSION_REGEXES.iter(), &swjs, 1).ok_or(
Error::Extraction(ExtractionError::InvalidData(Cow::Borrowed(
util::get_cg_from_regex(&CLIENT_VERSION_REGEX, &swjs, 1).ok_or(Error::Extraction(
ExtractionError::InvalidData(Cow::Borrowed(
"Could not find client version in sw.js",
))),
)
)),
))
});
let from_html = async {
@ -829,11 +831,11 @@ impl RustyPipe {
let html = self.http_request_txt(&builder.build().unwrap()).await?;
util::get_cg_from_regexes(CLIENT_VERSION_REGEXES.iter(), &html, 1).ok_or(
Error::Extraction(ExtractionError::InvalidData(Cow::Borrowed(
util::get_cg_from_regex(&CLIENT_VERSION_REGEX, &html, 1).ok_or(Error::Extraction(
ExtractionError::InvalidData(Cow::Borrowed(
"Could not find client version on html page",
))),
)
)),
))
};
if let Some(from_swjs) = from_swjs {
@ -965,11 +967,15 @@ impl RustyPipe {
///
/// Since the cookie is shared between YT and YTM and the YTM page loads faster,
/// we request that.
///
/// Sometimes YouTube does not set the `__Secure-YEC` cookie. In this case, the
/// visitor data is extracted from the HTML page.
async fn get_visitor_data(&self) -> Result<String, Error> {
log::debug!("getting YT visitor data");
let resp = self.inner.http.get(YOUTUBE_MUSIC_HOME_URL).send().await?;
resp.headers()
let vdata = resp
.headers()
.get_all(header::SET_COOKIE)
.iter()
.find_map(|c| {
@ -979,11 +985,28 @@ impl RustyPipe {
}
}
None
})
.ok_or(Error::Extraction(ExtractionError::InvalidData(
Cow::Borrowed("could not get YTM cookies"),
});
match vdata {
Some(vdata) => Ok(vdata),
None => {
if resp.status().is_success() {
// Extract visitor data from html
let html = resp.text().await?;
util::get_cg_from_regex(&VISITOR_DATA_REGEX, &html, 1).ok_or(Error::Extraction(
ExtractionError::InvalidData(Cow::Borrowed(
"Could not find visitor data on html page",
)),
))
} else {
Err(Error::Extraction(ExtractionError::InvalidData(
format!("Could not get visitor data, status: {}", resp.status()).into(),
)))
}
}
}
}
}
impl RustyPipeQuery {

View file

@ -41,14 +41,11 @@ pub const ARTIST_DISCOGRAPHY_PREFIX: &str = "MPAD";
const CONTENT_PLAYBACK_NONCE_ALPHABET: &[u8; 64] =
b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
/// Return the given capture group that matches first in a list of regexes
pub fn get_cg_from_regexes<'a, I>(mut regexes: I, text: &str, cg: usize) -> Option<String>
where
I: Iterator<Item = &'a Regex>,
{
regexes
.find_map(|pattern| pattern.captures(text))
.map(|c| c.get(cg).unwrap().as_str().to_owned())
/// Return the text of capture group `cg` from a match of `regex` in `text`.
///
/// Yields `None` when `captures` finds no match, or when the `get(cg)` lookup
/// on the match is `None`; otherwise the captured text is copied into an
/// owned `String`.
pub fn get_cg_from_regex(regex: &Regex, text: &str, cg: usize) -> Option<String> {
regex
.captures(text)
// `and_then` + `map` turn a missing group into `None` instead of unwrapping it
.and_then(|c| c.get(cg).map(|c| c.as_str().to_owned()))
}
/// Return the given capture group that matches first in a list of fancy regexes
@ -58,7 +55,7 @@ where
{
regexes
.find_map(|pattern| pattern.captures(text).ok().flatten())
.map(|c| c.get(cg).unwrap().as_str().to_owned())
.and_then(|c| c.get(cg).map(|c| c.as_str().to_owned()))
}
/// Generate a random string with given length and byte charset.