Compare commits

...

2 commits

2 changed files with 128 additions and 74 deletions

View file

@@ -6,8 +6,6 @@ mod util;
 pub use types::KakasiResult;
-use std::borrow::Cow;
 use unicode_normalization::UnicodeNormalization;
 use phfbin::PhfMap;
@@ -16,68 +14,67 @@ use types::{CharType, KanjiString, Readings};
 pub fn convert(text: &str) -> KakasiResult {
     let dict = PhfMap::new(util::KANJI_DICT);
-    // TODO: char conversion should be done with iterators
-    let text = text.nfkc().collect::<String>();
-    let text = convert_syn(&text);
+    let text = normalize(&text);

     let mut char_indices = text.char_indices().peekable();
     let mut kana_buf = String::new();
     let mut prev_buf_type = CharType::Whitespace;
     let mut prev_acc_type = CharType::Whitespace;
-    let mut cap = (false, false); // Capitalization flags
+    // 0: capitalize next word, 1: capitalize first sentence, 2: first sentence capitalized
+    let mut cap = (false, false, false);
     let mut res = KakasiResult::default();

     let conv_kana_buf = |kana_buf: &mut String,
                          res: &mut KakasiResult,
-                         prev_type: CharType,
-                         cap: &mut (bool, bool)| {
+                         prev_type: &mut CharType,
+                         cap: &mut (bool, bool, bool)| {
         if !kana_buf.is_empty() {
            let hira = convert_katakana(kana_buf);
            res.hiragana.push_str(&hira);
            let mut rom = hiragana_to_romaji(&hira);

            if cap.0 {
-                let done;
-                (rom, done) = util::capitalize_first_c(&rom);
-                cap.0 = !done;
-                if !cap.1 {
-                    (res.romaji, _) = util::capitalize_first_c(&res.romaji);
-                    cap.1 = true;
-                }
+                rom = util::capitalize_first_c(&rom);
+                cap.0 = false;
+            }
+            if cap.1 && !cap.2 {
+                res.romaji = util::capitalize_first_c(&res.romaji);
+                cap.2 = true;
            }
            util::ensure_trailing_space(
                &mut res.romaji,
-                prev_type != CharType::LeadingPunct && prev_type != CharType::JoiningPunct,
+                *prev_type != CharType::LeadingPunct && *prev_type != CharType::JoiningPunct,
            );
            res.romaji.push_str(&rom);
            kana_buf.clear();
+            *prev_type = CharType::Hiragana;
         }
     };

     while let Some((i, c)) = char_indices.next() {
         if util::is_char_in_range(c, util::HIRAGANA) {
             if prev_buf_type != CharType::Hiragana {
-                conv_kana_buf(&mut kana_buf, &mut res, prev_acc_type, &mut cap);
+                conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
             }
             kana_buf.push(c);
             prev_buf_type = CharType::Hiragana;
         } else if util::is_char_in_range(c, util::KATAKANA) {
             if prev_buf_type != CharType::Katakana {
-                conv_kana_buf(&mut kana_buf, &mut res, prev_acc_type, &mut cap);
+                conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
             }
             kana_buf.push(c);
             prev_buf_type = CharType::Katakana;
         } else if util::is_char_in_range(c, util::KANJI) {
             let (t, n) = convert_kanji(&text[i..], &kana_buf, &dict);
-            conv_kana_buf(&mut kana_buf, &mut res, prev_acc_type, &mut cap);
+            conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);

             if n > 0 {
                 kana_buf = t;
-                conv_kana_buf(&mut kana_buf, &mut res, prev_acc_type, &mut cap);
+                conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
                 for _ in 1..n {
                     char_indices.next();
                 }
@@ -89,18 +86,18 @@ pub fn convert(text: &str) -> KakasiResult {
            }
            prev_acc_type = CharType::Kanji;
         } else if c.is_whitespace() {
-            conv_kana_buf(&mut kana_buf, &mut res, prev_acc_type, &mut cap);
+            conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
            res.hiragana.push(c);
            res.romaji.push(c);
            prev_acc_type = CharType::Whitespace;
         } else if c == '・' {
-            conv_kana_buf(&mut kana_buf, &mut res, prev_acc_type, &mut cap);
+            conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
            res.hiragana.push(c);
            res.romaji.push(' ');
            prev_acc_type = CharType::Whitespace;
         } else if c == util::PROLONGED_SOUND_MARK {
            if prev_buf_type != CharType::Hiragana && prev_buf_type != CharType::Katakana {
-                conv_kana_buf(&mut kana_buf, &mut res, prev_acc_type, &mut cap);
+                conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
            }
            kana_buf.push(c);
            prev_buf_type = match prev_buf_type {
@@ -108,7 +105,7 @@ pub fn convert(text: &str) -> KakasiResult {
                _ => CharType::Katakana,
            };
         } else {
-            conv_kana_buf(&mut kana_buf, &mut res, prev_acc_type, &mut cap);
+            conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
            res.hiragana.push(c);

            let (c_rom, char_type) = util::PCT_DICT.get(&c).copied().unwrap_or_else(|| {
@@ -129,8 +126,8 @@ pub fn convert(text: &str) -> KakasiResult {
                )
            });

-            if (prev_acc_type != CharType::Other && prev_acc_type != CharType::Numeric)
-                || util::is_char_japanese_punctuation(c)
+            let is_jpunct = util::is_char_japanese_punctuation(c);
+            if (prev_acc_type != CharType::Other && prev_acc_type != CharType::Numeric) || is_jpunct
            {
                util::ensure_trailing_space(
                    &mut res.romaji,
@@ -140,17 +137,24 @@ pub fn convert(text: &str) -> KakasiResult {
                        && char_type != CharType::JoiningPunct,
                );
            }
-            res.romaji.push(c_rom);
-            if c_rom == '.' && char_type != CharType::Numeric {
-                cap.0 = true;
+            // Japanese punctuation was not normalized at the beginning,
+            // the normalization here will replace fullwidth characters with normal ones.
+            if is_jpunct && char_type == CharType::Other {
+                res.romaji.extend(c_rom.nfkc());
+            } else {
+                res.romaji.push(c_rom);
            }
+            cap.0 = c_rom == '.' && char_type != CharType::Numeric
+                || cap.0 && matches!(char_type, CharType::LeadingPunct | CharType::JoiningPunct);
+            cap.1 |= cap.0;

            prev_acc_type = char_type;
        };
    }
-    conv_kana_buf(&mut kana_buf, &mut res, prev_acc_type, &mut cap);
+    conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);

    res
 }
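The three-element cap tuple is the heart of the new capitalization handling. Below is a minimal, self-contained sketch (not the library's code: CharType and capitalize_first_c are redeclared locally and only plain ASCII words are considered) of how a full stop arms cap.0, how leading/joining punctuation keeps it armed, and how cap.1/cap.2 track the first sentence:

// Sketch only: CharType and capitalize_first_c are redeclared here so the
// example is self-contained; the real types live in the crate.
#[allow(dead_code)]
#[derive(Clone, Copy, PartialEq)]
enum CharType {
    LeadingPunct,
    JoiningPunct,
    TrailingPunct,
    Numeric,
    Other,
}

fn capitalize_first_c(text: &str) -> String {
    let mut done = false;
    text.chars()
        .map(|c| {
            if !done && c.is_ascii_alphabetic() {
                done = true;
                c.to_ascii_uppercase()
            } else {
                c
            }
        })
        .collect()
}

fn main() {
    // cap.0: capitalize the next word, cap.1: a sentence start was seen,
    // cap.2: the very first sentence has already been capitalized.
    let mut cap = (false, false, false);
    let mut romaji = String::new();

    // A full stop, an opening quote, then a word: the '.' arms cap.0 and the
    // leading punctuation keeps it armed until the next word is emitted.
    for (rom, char_type) in [
        (".", CharType::TrailingPunct),
        ("\"", CharType::LeadingPunct),
        ("tsugi", CharType::Other),
    ] {
        let mut rom = rom.to_string();
        if cap.0 && char_type == CharType::Other {
            rom = capitalize_first_c(&rom);
            cap.0 = false;
        }
        romaji.push_str(&rom);
        // Same update rule as in the diff above.
        cap.0 = rom == "." && char_type != CharType::Numeric
            || cap.0 && matches!(char_type, CharType::LeadingPunct | CharType::JoiningPunct);
        cap.1 |= cap.0;
    }

    assert_eq!(romaji, ".\"Tsugi");
    assert!(cap.1 && !cap.2); // res.romaji would still get its first char capitalized later
}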
@@ -304,43 +308,56 @@ fn convert_kanji(text: &str, btext: &str, dict: &PhfMap) -> (String, usize) {
     translation.map(|tl| (tl, n_c)).unwrap_or_default()
 }

-/// Convert all synonymous kanji and replace iteration characters (`々`)
-///
-/// The input text needs to be NFKC-normalized.
-fn convert_syn(text: &str) -> Cow<str> {
-    let mut replacements = text
-        .char_indices()
-        .filter_map(|(i, c)| {
+/// NFKC-normalize the text, convert all synonymous kanji
+/// and replace iteration marks (`々`)
+fn normalize(text: &str) -> String {
+    let mut imcount = 0;
+    let replacements = text.char_indices().filter_map(|(i, c)| {
+        if c == util::ITERATION_MARK {
+            // Count iteration marks
+            if imcount == 0 {
+                imcount = 1;
+                for c in text[i + c.len_utf8()..].chars() {
+                    if c == util::ITERATION_MARK {
+                        imcount += 1;
+                    } else {
+                        break;
+                    }
+                }
+            }
+            // Replace with the character imcount positions before
+            let mut chars_rev = text[0..i].chars().rev();
+            for _ in 1..imcount {
+                chars_rev.next();
+            }
+            chars_rev.next().map(|prev| (i, c.len_utf8(), prev))
+        } else {
+            imcount = 0;
            syn_dict::SYN_DICT
                .get(&c)
                .map(|r_char| (i, c.len_utf8(), *r_char))
                .or_else(|| {
-                    if c == util::ITERATION_MARK {
-                        text[0..i]
-                            .chars()
-                            .last()
-                            .map(|prev| (i, c.len_utf8(), prev))
+                    // Don't normalize Japanese punctuation, we need it to add correct spacing
+                    if util::is_char_fwidth_punctuation(c) {
+                        Some((i, c.len_utf8(), c))
                    } else {
                        None
                    }
                })
-        })
-        .peekable();
-    if replacements.peek().is_none() {
-        return Cow::Borrowed(text);
-    }
+        }
+    });

     let mut new = String::with_capacity(text.len());
     let mut last = 0;
     for (i, clen, r_char) in replacements {
-        new.push_str(&text[last..i]);
+        new.extend(text[last..i].nfkc());
         new.push(r_char);
         last = i + clen;
     }
-    new.push_str(&text[last..]);
-    Cow::Owned(new)
+    new.extend(text[last..].nfkc());
+    new
 }

 #[cfg(test)]
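The iteration-mark handling is the subtle part of the new normalize(): a run of 々 is counted once, and each mark is replaced by the character that sits that many positions before it in the original text, so 要所々々 becomes 要所要所 (matching the new romanize test case added further down). A standalone sketch of just that step, leaving out the SYN_DICT lookup and the NFKC pass, with a made-up helper name:

// Standalone sketch of only the iteration-mark expansion; the helper name is
// hypothetical and the SYN_DICT / NFKC parts of normalize() are left out.
const ITERATION_MARK: char = '々';

fn expand_iteration_marks(text: &str) -> String {
    let mut imcount = 0;
    let mut out = String::with_capacity(text.len());
    for (i, c) in text.char_indices() {
        if c == ITERATION_MARK {
            if imcount == 0 {
                // Count the whole run of marks once, at its first character
                imcount = 1 + text[i + c.len_utf8()..]
                    .chars()
                    .take_while(|&m| m == ITERATION_MARK)
                    .count();
            }
            // Replace the mark with the character `imcount` positions back
            if let Some(prev) = text[..i].chars().rev().nth(imcount - 1) {
                out.push(prev);
            }
        } else {
            imcount = 0;
            out.push(c);
        }
    }
    out
}

fn main() {
    assert_eq!(expand_iteration_marks("人々"), "人人");
    assert_eq!(expand_iteration_marks("要所々々"), "要所要所");
}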
@@ -354,7 +371,7 @@ mod tests {
    #[case("", "...")]
    #[case("", "..")]
    #[case("\u{FF70}", "\u{30FC}")]
-    fn t_normalize(#[case] text: &str, #[case] expect: &str) {
+    fn t_unicode_nfkc(#[case] text: &str, #[case] expect: &str) {
        let res = text.nfkc().collect::<String>();
        assert_eq!(res, expect);
    }
@@ -363,8 +380,8 @@ mod tests {
    #[case("壱意", "一意")]
    #[case("", "")]
    #[case("Abc", "Abc")]
-    fn t_convert_syn(#[case] text: &str, #[case] expect: &str) {
-        let res = convert_syn(text);
+    fn t_normalize(#[case] text: &str, #[case] expect: &str) {
+        let res = normalize(text);
        assert_eq!(res, expect);
    }
@ -455,7 +472,7 @@ mod tests {
#[case("アヷーリヤ品", "あゔぁーりやひん", "avaariya hin")] #[case("アヷーリヤ品", "あゔぁーりやひん", "avaariya hin")]
#[case( #[case(
"安藤 和風(あんどう はるかぜ、慶応2年1月12日1866年2月26日 - 昭和11年1936年12月26日は、日本のジャーナリスト、マスメディア経営者、俳人、郷土史研究家。通名および俳号は「和風」をそのまま音読みして「わふう」。秋田県の地方紙「秋田魁新報」の事業拡大に貢献し、秋田魁新報社三大柱石の一人と称された。「魁の安藤か、安藤の魁か」と言われるほど、新聞記者としての名声を全国にとどろかせた[4]。", "安藤 和風(あんどう はるかぜ、慶応2年1月12日1866年2月26日 - 昭和11年1936年12月26日は、日本のジャーナリスト、マスメディア経営者、俳人、郷土史研究家。通名および俳号は「和風」をそのまま音読みして「わふう」。秋田県の地方紙「秋田魁新報」の事業拡大に貢献し、秋田魁新報社三大柱石の一人と称された。「魁の安藤か、安藤の魁か」と言われるほど、新聞記者としての名声を全国にとどろかせた[4]。",
"あんどう わふう(あんどう はるかぜ、けいおう2ねん1がつ12にち(1866ねん2がつ26にち) - しょうわ11ねん(1936ねん)12がつ26にち)は、にっぽんのじゃーなりすと、ますめでぃあけいえいしゃ、はいじん、きょうどしけんきゅうか。とおりめいおよびはいごうは「わふう」をそのままおんよみして「わふう」。あきたけんのちほうし「あきたかいしんぽう」のじぎょうかくだいにこうけんし、あきたかいしんぽうしゃさんだいちゅうせきのひとりとしょうされた。「かいのあんどうか、あんどうのかいか」といわれるほど、しんぶんきしゃとしてのめいせいをぜんこくにとどろかせた[4]。", "あんどう わふう(あんどう はるかぜ、けいおう2ねん1がつ12にち1866ねん2がつ26にち - しょうわ11ねん1936ねん12がつ26にちは、にっぽんのじゃーなりすと、ますめでぃあけいえいしゃ、はいじん、きょうどしけんきゅうか。とおりめいおよびはいごうは「わふう」をそのままおんよみして「わふう」。あきたけんのちほうし「あきたかいしんぽう」のじぎょうかくだいにこうけんし、あきたかいしんぽうしゃさんだいちゅうせきのひとりとしょうされた。「かいのあんどうか、あんどうのかいか」といわれるほど、しんぶんきしゃとしてのめいせいをぜんこくにとどろかせた[4]。",
"Andou wafuu (andou harukaze, keiou 2 nen 1 gatsu 12 nichi (1866 nen 2 gatsu 26 nichi) - shouwa 11 nen (1936 nen) 12 gatsu 26 nichi) ha, nippon no jaanarisuto, masumedia keieisha, haijin, kyoudoshi kenkyuuka. Toori mei oyobi hai gou ha \"wafuu\" wosonomama onyomi shite \"wafuu\". Akitaken no chihoushi \"akita kai shinpou\" no jigyou kakudai ni kouken shi, akita kai shinpou sha sandai chuuseki no hitori to shousa reta. \"Kai no andou ka, andou no kai ka\" to iwa reruhodo, shinbunkisha toshiteno meisei wo zenkoku nitodorokaseta [4].", "Andou wafuu (andou harukaze, keiou 2 nen 1 gatsu 12 nichi (1866 nen 2 gatsu 26 nichi) - shouwa 11 nen (1936 nen) 12 gatsu 26 nichi) ha, nippon no jaanarisuto, masumedia keieisha, haijin, kyoudoshi kenkyuuka. Toori mei oyobi hai gou ha \"wafuu\" wosonomama onyomi shite \"wafuu\". Akitaken no chihoushi \"akita kai shinpou\" no jigyou kakudai ni kouken shi, akita kai shinpou sha sandai chuuseki no hitori to shousa reta. \"Kai no andou ka, andou no kai ka\" to iwa reruhodo, shinbunkisha toshiteno meisei wo zenkoku nitodorokaseta [4].",
)] )]
#[case( #[case(
@@ -465,12 +482,12 @@ mod tests {
    )]
    #[case(
        "緑黄色社会『ミチヲユケ』Official Video -「ファーストペンギン!」主題歌",
-        "みどりきいろしゃかい『みちをゆけ』Official Video -「ふぁーすとぺんぎん!」しゅだいか",
+        "みどりきいろしゃかい『みちをゆけ』Official Video -「ふぁーすとぺんぎん」しゅだいか",
        "midori kiiro shakai \"michiwoyuke\" Official Video - \"faasutopengin!\" shudaika"
    )]
    #[case(
        "MONKEY MAJIK - Running In The Dark【Lyric Video】日本語字幕付",
        "MONKEY MAJIK - Running In The Dark【Lyric Video】(にほんごじまくつき)",
        "MONKEY MAJIK - Running In The Dark [Lyric Video] (nihongo jimaku tsuki)"
    )]
    #[case(
@@ -478,6 +495,23 @@ mod tests {
        "とりしまりやくだいにせいさくぎじゅつぶぶちょう",
        "torishimariyaku daini seisaku gijutsubu buchou"
    )]
+    #[case(
+        "最初の安定版である1.0版がリリ",
+        "さいしょのあんていはんである1.0はんがりり",
+        "saisho no antei han dearu 1.0 han ga riri"
+    )]
+    #[case("にゃ$にゃ", "にゃ$にゃ", "nya $ nya")]
+    #[case(
+        "安定版となるRust 1.0がリリースされた[84]。1.0版の後、安定版およびベータ版が6週間おきに定期リリースされている[85]。",
+        "あんていはんとなるRust 1.0がりりーすされた[84]。1.0はんののち、あんていはんおよびべーたはんが6しゅうかんおきにていきりりーすされている[85]。",
+        "Antei han tonaru Rust 1.0 ga ririisu sareta [84]. 1.0 han no nochi, antei han oyobi beeta han ga 6 shuukan okini teiki ririisu sareteiru [85]."
+    )]
+    #[case(
+        "prelude文にTryIntoやTryFrom",
+        "preludeぶんにTryIntoやTryFrom",
+        "prelude bun ni TryInto ya TryFrom"
+    )]
+    #[case("要所々々", "ようしょようしょ", "yousho yousho")]
    fn romanize(#[case] text: &str, #[case] hiragana: &str, #[case] romaji: &str) {
        let res = convert(text);
        assert_eq!(res.hiragana, hiragana);
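For reference, the public entry point these cases exercise is convert(), which fills both output fields in one pass. A minimal usage sketch (the crate name kakasi is an assumption here; use whatever name the crate is actually published under):

// Usage sketch for the API exercised by the romanize tests. Assumptions: the
// crate is pulled in under the name `kakasi`, and KakasiResult's `hiragana`
// and `romaji` fields are public.
fn main() {
    let res = kakasi::convert("安定版となるRust 1.0がリリースされた。");
    println!("hiragana: {}", res.hiragana);
    println!("romaji:   {}", res.romaji);
}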

View file

@ -51,19 +51,18 @@ pub static PCT_DICT: phf::Map<char, (char, CharType)> = phf::phf_map!(
'〙' => (')', CharType::TrailingPunct), '〙' => (')', CharType::TrailingPunct),
'〝' => ('"', CharType::LeadingPunct), '〝' => ('"', CharType::LeadingPunct),
'〟' => ('"', CharType::TrailingPunct), '〟' => ('"', CharType::TrailingPunct),
'' => (':', CharType::TrailingPunct),
'.' => ('.', CharType::TrailingPunct), '' => (';', CharType::TrailingPunct),
',' => (',', CharType::TrailingPunct), '' => ('!', CharType::TrailingPunct),
':' => (':', CharType::TrailingPunct), '' => ('?', CharType::TrailingPunct),
';' => (';', CharType::TrailingPunct), '' => ('?', CharType::LeadingPunct),
'!' => ('!', CharType::TrailingPunct), '' => (')', CharType::TrailingPunct),
'?' => ('?', CharType::TrailingPunct), '' => (']', CharType::TrailingPunct),
')' => (')', CharType::TrailingPunct), '' => ('}', CharType::TrailingPunct),
']' => (']', CharType::TrailingPunct), '' => ('(', CharType::LeadingPunct),
'}' => ('}', CharType::TrailingPunct), '' => ('[', CharType::LeadingPunct),
'(' => ('(', CharType::LeadingPunct), '' => ('{', CharType::LeadingPunct),
'[' => ('[', CharType::LeadingPunct), '_' => ('{', CharType::JoiningPunct),
'{' => ('{', CharType::LeadingPunct),
); );
pub const HIRAGANA: (u32, u32) = (0x3041, 0x3096); pub const HIRAGANA: (u32, u32) = (0x3041, 0x3096);
@@ -76,11 +75,26 @@ pub const PROLONGED_SOUND_MARK: char = 'ー';
 const CJK_SYMBOLS_PUNCTUATION: (u32, u32) = (0x3000, 0x303F);
 const KANA_PUNCTUATION: (u32, u32) = (0xFF61, 0xFF65);
 const KATAKANA_PUNCTUATION: (u32, u32) = (0x30FB, 0x30FC);
+pub const ZENKAKU_PUNCTUATION_1: (u32, u32) = (0xFF01, 0xFF0F);
+pub const ZENKAKU_PUNCTUATION_2: (u32, u32) = (0xFF1A, 0xFF1F);
+pub const ZENKAKU_PUNCTUATION_3: (u32, u32) = (0xFF3B, 0xFF3F);
+pub const ZENKAKU_PUNCTUATION_4: (u32, u32) = (0xFF5B, 0xFF60);

-const JA_PUNCTUATION_RANGES: [(u32, u32); 3] = [
+const JA_PUNCTUATION_RANGES: [(u32, u32); 7] = [
     CJK_SYMBOLS_PUNCTUATION,
     KANA_PUNCTUATION,
     KATAKANA_PUNCTUATION,
+    ZENKAKU_PUNCTUATION_1,
+    ZENKAKU_PUNCTUATION_2,
+    ZENKAKU_PUNCTUATION_3,
+    ZENKAKU_PUNCTUATION_4,
+];
+
+const FW_PUNCTUATION_RANGES: [(u32, u32); 4] = [
+    ZENKAKU_PUNCTUATION_1,
+    ZENKAKU_PUNCTUATION_2,
+    ZENKAKU_PUNCTUATION_3,
+    ZENKAKU_PUNCTUATION_4,
 ];

 pub fn is_char_in_range(c: char, range: (u32, u32)) -> bool {
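The four ZENKAKU_* constants cover the fullwidth ASCII punctuation blocks (0xFF01-0xFF0F, 0xFF1A-0xFF1F, 0xFF3B-0xFF3F, 0xFF5B-0xFF60). A quick self-contained check of how is_char_in_range classifies them, with the bounds assumed to be inclusive like the existing ranges:

// Quick check of the range logic (range values copied from the diff; the
// inclusive-bounds behaviour of is_char_in_range is an assumption here).
const ZENKAKU_PUNCTUATION_1: (u32, u32) = (0xFF01, 0xFF0F);

fn is_char_in_range(c: char, range: (u32, u32)) -> bool {
    let cp = c as u32;
    range.0 <= cp && cp <= range.1
}

fn main() {
    assert!(is_char_in_range('！', ZENKAKU_PUNCTUATION_1)); // U+FF01, fullwidth '!'
    assert!(is_char_in_range('（', ZENKAKU_PUNCTUATION_1)); // U+FF08, fullwidth '('
    assert!(!is_char_in_range('!', ZENKAKU_PUNCTUATION_1)); // ASCII stays outside
}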
@@ -93,7 +107,13 @@ pub fn is_char_japanese_punctuation(c: char) -> bool {
        .any(|r| is_char_in_range(c, *r))
 }

-pub fn capitalize_first_c(text: &str) -> (String, bool) {
+pub fn is_char_fwidth_punctuation(c: char) -> bool {
+    FW_PUNCTUATION_RANGES
+        .iter()
+        .any(|r| is_char_in_range(c, *r))
+}
+
+pub fn capitalize_first_c(text: &str) -> String {
     let mut done = false;
     let res = text
         .chars()
@@ -106,7 +126,7 @@ pub fn capitalize_first_c(text: &str) -> (String, bool) {
            }
        })
        .collect::<String>();
-    (res, done)
+    res
 }

 pub fn ensure_trailing_space(text: &mut String, ts: bool) {
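capitalize_first_c now returns only the String; the done flag stays internal to the function. The closure body is folded out of this diff, so the sketch below assumes it simply uppercases the first alphabetic character it meets:

// Sketch of the simplified helper: the return type is now String and `done`
// only guards the closure. The closure body is not shown in the diff, so the
// uppercasing logic below is an assumption.
pub fn capitalize_first_c(text: &str) -> String {
    let mut done = false;
    let res = text
        .chars()
        .map(|c| {
            if !done && c.is_alphabetic() {
                done = true;
                c.to_uppercase().collect::<String>()
            } else {
                c.to_string()
            }
        })
        .collect::<String>();
    res
}

fn main() {
    assert_eq!(capitalize_first_c("\"wafuu\""), "\"Wafuu\"");
}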