fix: improve capitalization, handle multiple iteration marks

fix: tweaks to normalization
2022-11-27 00:33:49 +01:00 · 2022-11-26 23:40:45 +01:00
2 changed files with 128 additions and 74 deletions
--- a/src/lib.rs
+++ b/src/lib.rs
@ -6,8 +6,6 @@ mod util;

 pub use types::KakasiResult;

-use std::borrow::Cow;
-
 use unicode_normalization::UnicodeNormalization;

 use phfbin::PhfMap;
@ -16,68 +14,67 @@ use types::{CharType, KanjiString, Readings};
 pub fn convert(text: &str) -> KakasiResult {
    let dict = PhfMap::new(util::KANJI_DICT);

-    // TODO: char conversion should be done with iterators
-    let text = text.nfkc().collect::<String>();
-    let text = convert_syn(&text);
+    let text = normalize(&text);

    let mut char_indices = text.char_indices().peekable();
    let mut kana_buf = String::new();
    let mut prev_buf_type = CharType::Whitespace;
    let mut prev_acc_type = CharType::Whitespace;
-    let mut cap = (false, false);
+    // Capitalization flags
+    // 0: capitalize next word, 1: capitalize first sentence, 2: first sentence capitalized
+    let mut cap = (false, false, false);

    let mut res = KakasiResult::default();

    let conv_kana_buf = |kana_buf: &mut String,
                         res: &mut KakasiResult,
-                         prev_type: CharType,
-                         cap: &mut (bool, bool)| {
+                         prev_type: &mut CharType,
+                         cap: &mut (bool, bool, bool)| {
        if !kana_buf.is_empty() {
            let hira = convert_katakana(kana_buf);
            res.hiragana.push_str(&hira);
            let mut rom = hiragana_to_romaji(&hira);

            if cap.0 {
-                let done;
-                (rom, done) = util::capitalize_first_c(&rom);
-                cap.0 = !done;
-
-                if !cap.1 {
-                    (res.romaji, _) = util::capitalize_first_c(&res.romaji);
-                    cap.1 = true;
-                }
+                rom = util::capitalize_first_c(&rom);
+                cap.0 = false;
+            }
+            if cap.1 && !cap.2 {
+                res.romaji = util::capitalize_first_c(&res.romaji);
+                cap.2 = true;
            }

            util::ensure_trailing_space(
                &mut res.romaji,
-                prev_type != CharType::LeadingPunct && prev_type != CharType::JoiningPunct,
+                *prev_type != CharType::LeadingPunct && *prev_type != CharType::JoiningPunct,
            );
            res.romaji.push_str(&rom);

            kana_buf.clear();
+            *prev_type = CharType::Hiragana;
        }
    };

    while let Some((i, c)) = char_indices.next() {
        if util::is_char_in_range(c, util::HIRAGANA) {
            if prev_buf_type != CharType::Hiragana {
-                conv_kana_buf(&mut kana_buf, &mut res, prev_acc_type, &mut cap);
+                conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
            }
            kana_buf.push(c);
            prev_buf_type = CharType::Hiragana;
        } else if util::is_char_in_range(c, util::KATAKANA) {
            if prev_buf_type != CharType::Katakana {
-                conv_kana_buf(&mut kana_buf, &mut res, prev_acc_type, &mut cap);
+                conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
            }
            kana_buf.push(c);
            prev_buf_type = CharType::Katakana;
        } else if util::is_char_in_range(c, util::KANJI) {
            let (t, n) = convert_kanji(&text[i..], &kana_buf, &dict);
-            conv_kana_buf(&mut kana_buf, &mut res, prev_acc_type, &mut cap);
+            conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);

            if n > 0 {
                kana_buf = t;
-                conv_kana_buf(&mut kana_buf, &mut res, prev_acc_type, &mut cap);
+                conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
                for _ in 1..n {
                    char_indices.next();
                }
@ -89,18 +86,18 @@ pub fn convert(text: &str) -> KakasiResult {
            }
            prev_acc_type = CharType::Kanji;
        } else if c.is_whitespace() {
-            conv_kana_buf(&mut kana_buf, &mut res, prev_acc_type, &mut cap);
+            conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
            res.hiragana.push(c);
            res.romaji.push(c);
            prev_acc_type = CharType::Whitespace;
        } else if c == '・' {
-            conv_kana_buf(&mut kana_buf, &mut res, prev_acc_type, &mut cap);
+            conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
            res.hiragana.push(c);
            res.romaji.push(' ');
            prev_acc_type = CharType::Whitespace;
        } else if c == util::PROLONGED_SOUND_MARK {
            if prev_buf_type != CharType::Hiragana && prev_buf_type != CharType::Katakana {
-                conv_kana_buf(&mut kana_buf, &mut res, prev_acc_type, &mut cap);
+                conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
            }
            kana_buf.push(c);
            prev_buf_type = match prev_buf_type {
@ -108,7 +105,7 @@ pub fn convert(text: &str) -> KakasiResult {
                _ => CharType::Katakana,
            };
        } else {
-            conv_kana_buf(&mut kana_buf, &mut res, prev_acc_type, &mut cap);
+            conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);

            res.hiragana.push(c);
            let (c_rom, char_type) = util::PCT_DICT.get(&c).copied().unwrap_or_else(|| {
@ -129,8 +126,8 @@ pub fn convert(text: &str) -> KakasiResult {
                )
            });

-            if (prev_acc_type != CharType::Other && prev_acc_type != CharType::Numeric)
-                || util::is_char_japanese_punctuation(c)
+            let is_jpunct = util::is_char_japanese_punctuation(c);
+            if (prev_acc_type != CharType::Other && prev_acc_type != CharType::Numeric) || is_jpunct
            {
                util::ensure_trailing_space(
                    &mut res.romaji,
@ -140,17 +137,24 @@ pub fn convert(text: &str) -> KakasiResult {
                        && char_type != CharType::JoiningPunct,
                );
            }
-            res.romaji.push(c_rom);

-            if c_rom == '.' && char_type != CharType::Numeric {
-                cap.0 = true;
+            // Japanese punctuation was not normalized at the beginning;
+            // normalizing here replaces fullwidth characters with their standard-width forms.
+            if is_jpunct && char_type == CharType::Other {
+                res.romaji.extend(c_rom.nfkc());
+            } else {
+                res.romaji.push(c_rom);
            }

+            cap.0 = c_rom == '.' && char_type != CharType::Numeric
+                || cap.0 && matches!(char_type, CharType::LeadingPunct | CharType::JoiningPunct);
+            cap.1 |= cap.0;
+
            prev_acc_type = char_type;
        };
    }

-    conv_kana_buf(&mut kana_buf, &mut res, prev_acc_type, &mut cap);
+    conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
    res
 }

@ -304,43 +308,56 @@ fn convert_kanji(text: &str, btext: &str, dict: &PhfMap) -> (String, usize) {
    translation.map(|tl| (tl, n_c)).unwrap_or_default()
 }

-/// Convert all synonymous kanji and replace iteration characters (`々`)
-///
-/// The input text needs to be NFKC-normalized.
-fn convert_syn(text: &str) -> Cow<str> {
-    let mut replacements = text
-        .char_indices()
-        .filter_map(|(i, c)| {
+/// NFKC-normalize the text, convert all synonymous kanji
+/// and replace iteration marks (`々`)
+fn normalize(text: &str) -> String {
+    let mut imcount = 0;
+    let replacements = text.char_indices().filter_map(|(i, c)| {
+        if c == util::ITERATION_MARK {
+            // Count iteration marks
+            if imcount == 0 {
+                imcount = 1;
+                for c in text[i + c.len_utf8()..].chars() {
+                    if c == util::ITERATION_MARK {
+                        imcount += 1;
+                    } else {
+                        break;
+                    }
+                }
+            }
+
+            // Replace with the character imcount positions before
+            let mut chars_rev = text[0..i].chars().rev();
+            for _ in 1..imcount {
+                chars_rev.next();
+            }
+            chars_rev.next().map(|prev| (i, c.len_utf8(), prev))
+        } else {
+            imcount = 0;
            syn_dict::SYN_DICT
                .get(&c)
                .map(|r_char| (i, c.len_utf8(), *r_char))
                .or_else(|| {
-                    if c == util::ITERATION_MARK {
-                        text[0..i]
-                            .chars()
-                            .last()
-                            .map(|prev| (i, c.len_utf8(), prev))
+                    // Don't normalize Japanese punctuation; we need it to add correct spacing
+                    if util::is_char_fwidth_punctuation(c) {
+                        Some((i, c.len_utf8(), c))
                    } else {
                        None
                    }
                })
-        })
-        .peekable();
-
-    if replacements.peek().is_none() {
-        return Cow::Borrowed(text);
-    }
+        }
+    });

    let mut new = String::with_capacity(text.len());
    let mut last = 0;

    for (i, clen, r_char) in replacements {
-        new.push_str(&text[last..i]);
+        new.extend(text[last..i].nfkc());
        new.push(r_char);
        last = i + clen;
    }
-    new.push_str(&text[last..]);
-    Cow::Owned(new)
+    new.extend(text[last..].nfkc());
+    new
 }

 #[cfg(test)]
@ -354,7 +371,7 @@ mod tests {
    #[case("…", "...")]
    #[case("‥", "..")]
    #[case("\u{FF70}", "\u{30FC}")]
-    fn t_normalize(#[case] text: &str, #[case] expect: &str) {
+    fn t_unicode_nfkc(#[case] text: &str, #[case] expect: &str) {
        let res = text.nfkc().collect::<String>();
        assert_eq!(res, expect);
    }
@ -363,8 +380,8 @@ mod tests {
    #[case("壱意", "一意")]
    #[case("", "")]
    #[case("Abc", "Abc")]
-    fn t_convert_syn(#[case] text: &str, #[case] expect: &str) {
-        let res = convert_syn(text);
+    fn t_normalize(#[case] text: &str, #[case] expect: &str) {
+        let res = normalize(text);
        assert_eq!(res, expect);
    }

@ -455,7 +472,7 @@ mod tests {
    #[case("アヷーリヤ品", "あゔぁーりやひん", "avaariya hin")]
    #[case(
        "安藤 和風（あんどう はるかぜ、慶応2年1月12日（1866年2月26日） - 昭和11年（1936年）12月26日）は、日本のジャーナリスト、マスメディア経営者、俳人、郷土史研究家。通名および俳号は「和風」をそのまま音読みして「わふう」。秋田県の地方紙「秋田魁新報」の事業拡大に貢献し、秋田魁新報社三大柱石の一人と称された。「魁の安藤か、安藤の魁か」と言われるほど、新聞記者としての名声を全国にとどろかせた[4]。",
-        "あんどう わふう(あんどう はるかぜ、けいおう2ねん1がつ12にち(1866ねん2がつ26にち) - しょうわ11ねん(1936ねん)12がつ26にち)は、にっぽんのじゃーなりすと、ますめでぃあけいえいしゃ、はいじん、きょうどしけんきゅうか。とおりめいおよびはいごうは「わふう」をそのままおんよみして「わふう」。あきたけんのちほうし「あきたかいしんぽう」のじぎょうかくだいにこうけんし、あきたかいしんぽうしゃさんだいちゅうせきのひとりとしょうされた。「かいのあんどうか、あんどうのかいか」といわれるほど、しんぶんきしゃとしてのめいせいをぜんこくにとどろかせた[4]。",
+        "あんどう わふう（あんどう はるかぜ、けいおう2ねん1がつ12にち（1866ねん2がつ26にち） - しょうわ11ねん（1936ねん）12がつ26にち）は、にっぽんのじゃーなりすと、ますめでぃあけいえいしゃ、はいじん、きょうどしけんきゅうか。とおりめいおよびはいごうは「わふう」をそのままおんよみして「わふう」。あきたけんのちほうし「あきたかいしんぽう」のじぎょうかくだいにこうけんし、あきたかいしんぽうしゃさんだいちゅうせきのひとりとしょうされた。「かいのあんどうか、あんどうのかいか」といわれるほど、しんぶんきしゃとしてのめいせいをぜんこくにとどろかせた[4]。",
        "Andou wafuu (andou harukaze, keiou 2 nen 1 gatsu 12 nichi (1866 nen 2 gatsu 26 nichi) - shouwa 11 nen (1936 nen) 12 gatsu 26 nichi) ha, nippon no jaanarisuto, masumedia keieisha, haijin, kyoudoshi kenkyuuka. Toori mei oyobi hai gou ha \"wafuu\" wosonomama onyomi shite \"wafuu\". Akitaken no chihoushi \"akita kai shinpou\" no jigyou kakudai ni kouken shi, akita kai shinpou sha sandai chuuseki no hitori to shousa reta. \"Kai no andou ka, andou no kai ka\" to iwa reruhodo, shinbunkisha toshiteno meisei wo zenkoku nitodorokaseta [4].",
    )]
    #[case(
@ -465,12 +482,12 @@ mod tests {
    )]
    #[case(
        "緑黄色社会『ミチヲユケ』Official Video -「ファーストペンギン！」主題歌",
-        "みどりきいろしゃかい『みちをゆけ』Official Video -「ふぁーすとぺんぎん!」しゅだいか",
+        "みどりきいろしゃかい『みちをゆけ』Official Video -「ふぁーすとぺんぎん！」しゅだいか",
        "midori kiiro shakai \"michiwoyuke\" Official Video - \"faasutopengin!\" shudaika"
    )]
    #[case(
        "MONKEY MAJIK - Running In The Dark【Lyric Video】（日本語字幕付）",
-        "MONKEY MAJIK - Running In The Dark【Lyric Video】(にほんごじまくつき)",
+        "MONKEY MAJIK - Running In The Dark【Lyric Video】（にほんごじまくつき）",
        "MONKEY MAJIK - Running In The Dark [Lyric Video] (nihongo jimaku tsuki)"
    )]
    #[case(
@ -478,6 +495,23 @@ mod tests {
        "とりしまりやくだいにせいさくぎじゅつぶぶちょう",
        "torishimariyaku daini seisaku gijutsubu buchou"
    )]
+    #[case(
+        "最初の安定版である1.0版がリリ",
+        "さいしょのあんていはんである1.0はんがりり",
+        "saisho no antei han dearu 1.0 han ga riri"
+    )]
+    #[case("にゃ＄にゃ", "にゃ＄にゃ", "nya $ nya")]
+    #[case(
+        "安定版となるRust 1.0がリリースされた[84]。1.0版の後、安定版およびベータ版が6週間おきに定期リリースされている[85]。",
+        "あんていはんとなるRust 1.0がりりーすされた[84]。1.0はんののち、あんていはんおよびべーたはんが6しゅうかんおきにていきりりーすされている[85]。",
+        "Antei han tonaru Rust 1.0 ga ririisu sareta [84]. 1.0 han no nochi, antei han oyobi beeta han ga 6 shuukan okini teiki ririisu sareteiru [85]."
+    )]
+    #[case(
+        "prelude文にTryIntoやTryFrom",
+        "preludeぶんにTryIntoやTryFrom",
+        "prelude bun ni TryInto ya TryFrom"
+    )]
+    #[case("要所々々", "ようしょようしょ", "yousho yousho")]
    fn romanize(#[case] text: &str, #[case] hiragana: &str, #[case] romaji: &str) {
        let res = convert(text);
        assert_eq!(res.hiragana, hiragana);
--- a/src/util.rs
+++ b/src/util.rs
@ -51,19 +51,18 @@ pub static PCT_DICT: phf::Map<char, (char, CharType)> = phf::phf_map!(
    '〙' => (')', CharType::TrailingPunct),
    '〝' => ('"', CharType::LeadingPunct),
    '〟' => ('"', CharType::TrailingPunct),
-
-    '.' => ('.', CharType::TrailingPunct),
-    ',' => (',', CharType::TrailingPunct),
-    ':' => (':', CharType::TrailingPunct),
-    ';' => (';', CharType::TrailingPunct),
-    '!' => ('!', CharType::TrailingPunct),
-    '?' => ('?', CharType::TrailingPunct),
-    ')' => (')', CharType::TrailingPunct),
-    ']' => (']', CharType::TrailingPunct),
-    '}' => ('}', CharType::TrailingPunct),
-    '(' => ('(', CharType::LeadingPunct),
-    '[' => ('[', CharType::LeadingPunct),
-    '{' => ('{', CharType::LeadingPunct),
+    '：' => (':', CharType::TrailingPunct),
+    '；' => (';', CharType::TrailingPunct),
+    '！' => ('!', CharType::TrailingPunct),
+    '？' => ('?', CharType::TrailingPunct),
+    '＃' => ('#', CharType::LeadingPunct), // fullwidth number sign romanizes to '#', not '?'
+    '）' => (')', CharType::TrailingPunct),
+    '］' => (']', CharType::TrailingPunct),
+    '｝' => ('}', CharType::TrailingPunct),
+    '（' => ('(', CharType::LeadingPunct),
+    '［' => ('[', CharType::LeadingPunct),
+    '｛' => ('{', CharType::LeadingPunct),
+    '＿' => ('_', CharType::JoiningPunct), // fullwidth low line romanizes to '_', not '{'
 );

 pub const HIRAGANA: (u32, u32) = (0x3041, 0x3096);
@ -76,11 +75,26 @@ pub const PROLONGED_SOUND_MARK: char = 'ー';
 const CJK_SYMBOLS_PUNCTUATION: (u32, u32) = (0x3000, 0x303F);
 const KANA_PUNCTUATION: (u32, u32) = (0xFF61, 0xFF65);
 const KATAKANA_PUNCTUATION: (u32, u32) = (0x30FB, 0x30FC);
+pub const ZENKAKU_PUNCTUATION_1: (u32, u32) = (0xFF01, 0xFF0F);
+pub const ZENKAKU_PUNCTUATION_2: (u32, u32) = (0xFF1A, 0xFF1F);
+pub const ZENKAKU_PUNCTUATION_3: (u32, u32) = (0xFF3B, 0xFF3F);
+pub const ZENKAKU_PUNCTUATION_4: (u32, u32) = (0xFF5B, 0xFF60);

-const JA_PUNCTUATION_RANGES: [(u32, u32); 3] = [
+const JA_PUNCTUATION_RANGES: [(u32, u32); 7] = [
    CJK_SYMBOLS_PUNCTUATION,
    KANA_PUNCTUATION,
    KATAKANA_PUNCTUATION,
+    ZENKAKU_PUNCTUATION_1,
+    ZENKAKU_PUNCTUATION_2,
+    ZENKAKU_PUNCTUATION_3,
+    ZENKAKU_PUNCTUATION_4,
+];
+
+const FW_PUNCTUATION_RANGES: [(u32, u32); 4] = [
+    ZENKAKU_PUNCTUATION_1,
+    ZENKAKU_PUNCTUATION_2,
+    ZENKAKU_PUNCTUATION_3,
+    ZENKAKU_PUNCTUATION_4,
 ];

 pub fn is_char_in_range(c: char, range: (u32, u32)) -> bool {
@ -93,7 +107,13 @@ pub fn is_char_japanese_punctuation(c: char) -> bool {
        .any(|r| is_char_in_range(c, *r))
 }

-pub fn capitalize_first_c(text: &str) -> (String, bool) {
+pub fn is_char_fwidth_punctuation(c: char) -> bool {
+    FW_PUNCTUATION_RANGES
+        .iter()
+        .any(|r| is_char_in_range(c, *r))
+}
+
+pub fn capitalize_first_c(text: &str) -> String {
    let mut done = false;
    let res = text
        .chars()
@ -106,7 +126,7 @@ pub fn capitalize_first_c(text: &str) -> (String, bool) {
            }
        })
        .collect::<String>();
-    (res, done)
+    res
 }

 pub fn ensure_trailing_space(text: &mut String, ts: bool) {
Author	SHA1	Message	Date
ThetaDev	967d1cdd62	fix: improve capitalization, handle multiple iteration marks	2022-11-27 00:33:49 +01:00
ThetaDev	55e91fb17f	fix: tweaks to normalization	2022-11-26 23:40:45 +01:00