Compare commits

...

7 Commits
v0.5.0 ... main

  1. 167
      src/byte_string.rs

@ -349,6 +349,173 @@ pub(crate) fn translate_with_style_lower_and_upper_suffixes (
return;
}
let mut global_index: usize = 0;
loop {
//Copies characters in-between words
//TODO this could probably be optimized with vector instructions
{
let mut start_of_in_between_words_index: usize = global_index;//Inclusive
loop {
if english[global_index].is_ascii_alphabetic() {//Start of a word
break;
}
global_index += 1;
if global_index == english.len() {
//Copy all of the characters so far (all that remain) and return
let remaining_characters_slice = &english[start_of_in_between_words_index..];
pig_latin_string.extend_from_slice(remaining_characters_slice);
return;
}
}
//Copy the characters in-between words as-is
let in_between_words_characters_slice = &english[start_of_in_between_words_index..global_index];
pig_latin_string.extend_from_slice(in_between_words_characters_slice);
//At this point, global_index contains the index to the start of the word to translate
}
//Translates the current word
{
let word_start_index = global_index;
let first_letter = english[word_start_index];
global_index += 1;
if (global_index == english.len()) || (!english[global_index].is_ascii_alphabetic()) {//The word is only one letter long (special case)
//Push the letter and add the lowercase special suffix (even if the letter is uppercase)
pig_latin_string.push(first_letter);
pig_latin_string.extend_from_slice(special_case_suffix_lower);
} else if is_vowel(first_letter) {//The word is longer than a letter and starts with a vowel (special case)
//As a heuristic, we consider Y to be a vowel when it is not at the start of the word
//Get the slice containing the whole word
let slice_to_search_for_end = &english[global_index..];
let word_slice: &[u8];
if let Some(found_end_of_word_index) = slice_to_search_for_end.iter().position(|&x| !x.is_ascii_alphabetic()) {//We found a non-letter that ends the word
global_index += found_end_of_word_index;
word_slice = &english[word_start_index..global_index];
} else {//The string ended
global_index = english.len();
word_slice = slice_to_search_for_end;
}
//Translate the word and push it
pig_latin_string.extend_from_slice(word_slice);
if fast_is_ascii_uppercase(english[word_start_index + 1]) {//As a heuristic, we consider the word to be uppercase if the second letter is
pig_latin_string.extend_from_slice(special_case_suffix_upper);
} else {//Word is entirely lowercase, or its first letter is uppercase only
pig_latin_string.extend_from_slice(special_case_suffix_lower);
}
} else {//The word is longer than a letter and doesn't start with a vowel
//Find the first vowel; we assume the word actually has a vowel in it
let first_vowel_index: usize;
let slice_to_search_for_vowel = &english[global_index..];
if let Some(first_vowel_of_word_index) = slice_to_search_for_vowel.iter().position(|&x| { is_vowel(x) || is_y(x) }) {//As a heuristic, we consider Y to be a vowel when it is not at the start of the word
global_index += first_vowel_of_word_index;
} else {//This string ended and we never found a vowel
return;//Just give up
}
first_vowel_index = global_index;
//Find the end of the word
let word_end_index: usize;
let slice_to_search_for_end = &english[global_index..];
if let Some(end_of_word_index) = slice_to_search_for_end.iter().position(|&x| !x.is_ascii_alphabetic()) {//We found a non-letter that ends the word
global_index += end_of_word_index;
} else {//The string ended
global_index = english.len();
}
word_end_index = global_index;
//Translate the word
//TODO improve code reuse here
if fast_is_ascii_uppercase(first_letter) {//Check if the first letter is uppercase
if fast_is_ascii_uppercase(english[word_start_index + 1]) {//As a heuristic, we consider the word to be uppercase if the second letter is
//Push the vowel and all letters after it
let vowel_to_end_slice = &english[first_vowel_index..word_end_index];
pig_latin_string.extend_from_slice(vowel_to_end_slice);
//Push the starting consonants
let start_to_vowel_slice = &english[word_start_index..first_vowel_index];
pig_latin_string.extend_from_slice(start_to_vowel_slice);
//Push the normal suffix (uppercase)
pig_latin_string.extend_from_slice(suffix_upper);
} else {//Word starts with an uppercase letter, but is otherwise lowercase
//Push the vowel, matching the starting case of the original word
pig_latin_string.push(fast_to_ascii_uppercase(english[first_vowel_index]));
//Push all letters after the vowel
let after_vowel_slice = &english[(first_vowel_index + 1)..word_end_index];
pig_latin_string.extend_from_slice(after_vowel_slice);
//Push the first starting consonant, which should be lowercase now
pig_latin_string.push(fast_to_ascii_lowercase(english[word_start_index]));
//Push the remaining starting consonants
let after_start_to_vowel_slice = &english[(word_start_index + 1)..first_vowel_index];
pig_latin_string.extend_from_slice(after_start_to_vowel_slice);
//Push the normal suffix
pig_latin_string.extend_from_slice(suffix_lower);
}
} else {//Word is entirely lowercase
//Push the vowel and all letters after it
let vowel_to_end_slice = &english[first_vowel_index..word_end_index];
pig_latin_string.extend_from_slice(vowel_to_end_slice);
//Push the starting consonants
let start_to_vowel_slice = &english[word_start_index..first_vowel_index];
pig_latin_string.extend_from_slice(start_to_vowel_slice);
//Push the normal suffix (lowercase)
pig_latin_string.extend_from_slice(suffix_lower);
}
}
//Don't go on if we reached the end of the string during the word
if global_index == english.len() {
return;
}
//At this point, global_index contains the index to the next character to check
}
//Copies contraction suffixes, if present
if english[global_index] == b'\'' {//TODO if this is true we can also skip the regular inter-word loop on the next iteration
let mut start_of_contraction_suffix_index: usize = global_index;//Inclusive
global_index += 1;//We skip over the apostrophe for the loop below, but we still want to copy it in the end
loop {
if global_index == english.len() {
//Copy all of the characters so far (all that remain) and return
let remaining_characters_slice = &english[start_of_contraction_suffix_index..];
pig_latin_string.extend_from_slice(remaining_characters_slice);
return;
}
if !english[global_index].is_ascii_alphabetic() {//End of the contraction suffix
break;
}
global_index += 1;
}
//Copy the contraction suffix as-is
let contraction_suffix_slice = &english[start_of_contraction_suffix_index..global_index];
pig_latin_string.extend_from_slice(contraction_suffix_slice);
}
}
}
//Avoids the overhead of having to convert suffixes to uppercase for the standard translation functions at runtime
pub(crate) fn translate_with_style_lower_and_upper_suffixes_old (
english: &[u8],
suffix_lower: &[u8], special_case_suffix_lower: &[u8], suffix_upper: &[u8], special_case_suffix_upper: &[u8],
pig_latin_string: &mut Vec::<u8>
) {
if english.is_empty() {
return;
}
//TODO merge the word and the generic text function into one function to allow for optimizations with certain things
//TODO do an SSE/AVX optimized version of this

Loading…
Cancel
Save