fix: Swahili duration text parsing

fix tests
2023-05-22 17:44:14 +02:00 · 2023-05-22 15:17:05 +02:00
3 changed files with 27 additions and 6 deletions
--- a/src/client/response/video_item.rs
+++ b/src/client/response/video_item.rs
@@ -531,8 +531,15 @@ impl<T> YouTubeListMapper<T> {
        });

        let length = video.accessibility.and_then(|acc| {
-            let parts = ACCESSIBILITY_SEP_REGEX.split(&acc).collect::<Vec<_>>();
-            if parts.len() > 2 {
+            // The video title has to be stripped from the beginning because in Swahili
+            // the duration follows the title with no separator (probably a bug).
+            // Example: `what I do with leftoversdakika 1 - cheza video`
+            let parts = ACCESSIBILITY_SEP_REGEX
+                .split(acc.trim_start_matches(&video.headline))
+                .collect::<Vec<_>>();
+            if parts.len() > 1 {
+                // In Russian, the duration is the last part
+                // Example: `Воспроизвести видео – "hangover food". Его продолжительность – 58 секунд.`
                let i = match self.lang {
                    Language::Ru => 1,
                    _ => 2,
--- a/src/util/timeago.rs
+++ b/src/util/timeago.rs
@@ -344,7 +344,21 @@ struct DurationTxtSegment {
    word: String,
 }

-fn split_duration_txt(txt: &str, start_c: bool) -> Vec<DurationTxtSegment> {
+/// Split a video duration string into its segments.
+///
+/// Each segment consists of a word and a string of digits (one of them may be empty).
+///
+/// The `start_word` parameter determines whether each segment starts with its word
+/// instead of its number. This is the case in Swahili and Sinhala.
+///
+/// Example (start_word=false):
+/// - `1 minute, 13 seconds` -> `{1;minute} {13;seconds}`
+/// - `foo 1 minute, 13 seconds bar` -> `{foo} {1;minute} {13;seconds bar}`
+///
+/// Example (start_word=true):
+/// - `dakika 1 na sekunde 1` -> `{1;dakika} {1;na sekunde}`
+/// - `foo dakika 1 na sekunde 1 bar` -> `{1;foo dakika} {1;na sekunde} {bar}`
+fn split_duration_txt(txt: &str, start_word: bool) -> Vec<DurationTxtSegment> {
    let mut segments = Vec::new();

    // 1: parse digits, 2: parse word
@@ -353,14 +367,14 @@ fn split_duration_txt(txt: &str, start_c: bool) -> Vec<DurationTxtSegment> {

    for c in txt.trim().chars() {
        if c.is_ascii_digit() {
-            if state == 2 && (!seg.digits.is_empty() || (!start_c && segments.is_empty())) {
+            if state == 2 && (!seg.digits.is_empty() || (!start_word && segments.is_empty())) {
                segments.push(seg);
                seg = DurationTxtSegment::default();
            }
            seg.digits.push(c);
            state = 1;
        } else {
-            if (state == 1) && (!seg.word.is_empty() || (start_c && segments.is_empty())) {
+            if (state == 1) && (!seg.word.is_empty() || (start_word && segments.is_empty())) {
                segments.push(seg);
                seg = DurationTxtSegment::default();
            }
--- a/tests/youtube.rs
+++ b/tests/youtube.rs
@@ -1001,7 +1001,7 @@ fn channel_order(
    ))
    .unwrap();
    // Upload dates should be in descending order
-    if tab != ChannelVideoTab::Shorts {
+    if tab == ChannelVideoTab::Videos {
        let mut latest_items = latest.items.iter().peekable();
        while let (Some(v), Some(next_v)) = (latest_items.next(), latest_items.peek()) {
            if !v.is_upcoming && !v.is_live && !next_v.is_upcoming && !next_v.is_live {
Author	SHA1	Message	Date
ThetaDev	da8b2a27fc	fix: Swahili duration text parsing All checks were successful ci/woodpecker/push/woodpecker Pipeline was successful Details	2023-05-22 17:44:14 +02:00
ThetaDev	2c4d70cc0d	fix tests	2023-05-22 15:17:05 +02:00