Begin rewriting translation function to be more efficient

1 year ago · 1c1df67d64
parent 10af4bd813
commit 1c1df67d64
1 changed files with 151 additions and 0 deletions
--- a/src/byte_string.rs
+++ b/src/byte_string.rs
@@ -349,6 +349,157 @@ pub(crate) fn translate_with_style_lower_and_upper_suffixes (
        return;
    }

+    #[derive(Debug, Clone, Copy)]
+    enum State {//Where the byte-by-byte scan over english currently is; the scan starts in InBetweenWords
+        InBetweenWords,//Outside any word; pending bytes are copied to the output once the next ASCII letter is seen
+        //InFirstLetterOfWord,//TODO to specially handle one-letter words
+        InRegularWord,//Word began with a letter for which is_vowel() is false; still looking for its first vowel (or y)
+        FinishingRegularWord,//First vowel found; scanning to the end of the word, where it gets translated
+        InWordStartingWithVowel,//Word began with a letter for which is_vowel() is true; scanning to its end
+        InContractionSuffix//After a word that was ended by an apostrophe; the bytes up to the next non-letter are copied through unchanged
+    };
+
+    let mut current_state: State = State::InBetweenWords;
+    let mut word_start_index: usize = 0xDEADBEEF;//Inclusive
+    let mut first_vowel_index: usize = 0xDEADBEEF;//Also exclusive end of starting consonants
+    let mut start_of_in_between_words_index: usize = 0;//Inclusive; Also exclusive end of word
+
+    for i in 0..english.len() {
+        //println!("{:?}", current_state);
+        match current_state {
+            State::InBetweenWords => {
+                if english[i].is_ascii_alphabetic() {
+                    //This is the start of the word, so copy all non-word characters up to this point since the last word
+                    let in_between_words_characters_slice = &english[start_of_in_between_words_index..i];
+                    pig_latin_string.extend_from_slice(in_between_words_characters_slice);
+
+                    //Setup things to begin processing the word
+                    word_start_index = i;
+                    if is_vowel(english[i]) {//As a heuristic, we consider Y to be a vowel when it is not at the start of the word
+                        current_state = State::InWordStartingWithVowel;
+                    } else {
+                        current_state = State::InRegularWord;
+                    }
+                }
+            },
+            State::InRegularWord => {
+                if is_vowel(english[i]) || is_y(english[i]) {
+                    first_vowel_index = i;
+                    current_state = State::FinishingRegularWord;
+                }
+                //TODO handle the case where the word ends here before a vowel is encountered
+            },
+            State::FinishingRegularWord => {
+                if !english[i].is_ascii_alphabetic() {//End of word
+                    //We now need to actually translate the word
+                    if fast_is_ascii_uppercase(english[word_start_index]) {//The first letter of the word was uppercase
+                        if fast_is_ascii_uppercase(english[first_vowel_index]) {//Heuristic: Assume the word was uppercase if the first vowel is
+                            //TODO
+                        } else {
+                            //Push the vowel, matching the starting case of the original word
+                            pig_latin_string.push(fast_to_ascii_uppercase(english[first_vowel_index]));
+
+                            //Push all letters after the vowel
+                            let after_vowel_slice = &english[(first_vowel_index + 1)..i];
+                            pig_latin_string.extend_from_slice(after_vowel_slice);
+
+                            //Push the first starting consonant, which should be lowercase now
+                            pig_latin_string.push(fast_to_ascii_lowercase(english[word_start_index]));
+
+                            //Push the remaining starting consonants
+                            let after_start_to_vowel_slice = &english[(word_start_index + 1)..first_vowel_index];
+                            pig_latin_string.extend_from_slice(after_start_to_vowel_slice);
+
+                            //Push the normal suffix
+                            pig_latin_string.extend_from_slice(suffix_lower);
+                        }
+                    } else {//The first letter of the word was lowercase
+                        //Push the vowel and all letters after it
+                        let vowel_to_end_slice = &english[first_vowel_index..i];
+                        pig_latin_string.extend_from_slice(vowel_to_end_slice);
+
+                        //Push the starting consonants
+                        let start_to_vowel_slice = &english[word_start_index..first_vowel_index];
+                        pig_latin_string.extend_from_slice(start_to_vowel_slice);
+
+                        //Push the normal suffix
+                        pig_latin_string.extend_from_slice(suffix_lower);
+                    }
+
+                    //Decide the next state
+                    if english[i] == b'\'' {
+                        current_state = State::InContractionSuffix;
+                    } else {
+                        current_state = State::InBetweenWords;
+                    }
+                    start_of_in_between_words_index = i;
+                }
+            }
+            State::InWordStartingWithVowel => {
+                if !english[i].is_ascii_alphabetic() {//End of word
+                    //We now need to actually translate the word
+                    //TODO handle uppercase
+                    let word_slice = &english[word_start_index..i];
+                    pig_latin_string.extend_from_slice(word_slice);
+                    pig_latin_string.extend_from_slice(special_case_suffix_lower);
+
+                    //Decide the next state
+                    if english[i] == b'\'' {
+                        current_state = State::InContractionSuffix;
+                    } else {
+                        current_state = State::InBetweenWords;
+                    }
+                    start_of_in_between_words_index = i;
+                }
+            },
+            State::InContractionSuffix => {
+                if !english[i].is_ascii_alphabetic() {//End of contraction suffix
+                    //Push the contraction suffix
+                    let contraction_suffix_slice = &english[start_of_in_between_words_index..i];
+                    pig_latin_string.extend_from_slice(contraction_suffix_slice);
+
+                    //We're back in-between words
+                    start_of_in_between_words_index = i;
+                    current_state = State::InBetweenWords;
+                }
+            },
+        }
+    }
+
+    //Wrap-up based on the state we ended the loop in
+    match current_state {
+        State::InBetweenWords => {
+            //Copy remaining characters
+            let remaining_characters_slice = &english[start_of_in_between_words_index..];
+            pig_latin_string.extend_from_slice(remaining_characters_slice);
+        },
+        State::InRegularWord => {
+            //TODO
+        },
+        State::FinishingRegularWord => {
+            //TODO
+        }
+        State::InWordStartingWithVowel => {
+            //TODO
+        },
+        State::InContractionSuffix => {
+            //Push the contraction suffix
+            let contraction_suffix_slice = &english[start_of_in_between_words_index..i];
+            pig_latin_string.extend_from_slice(contraction_suffix_slice);
+        },
+    }
+}
+
+//Avoids the overhead of having to convert suffixes to uppercase for the standard translation functions at runtime
+pub(crate) fn translate_with_style_lower_and_upper_suffixes_old (
+    english: &[u8],
+    suffix_lower: &[u8], special_case_suffix_lower: &[u8], suffix_upper: &[u8], special_case_suffix_upper: &[u8],
+    pig_latin_string: &mut Vec::<u8>
+) {
+    if english.is_empty() {
+        return;
+    }
+
    //TODO merge the word and the generic text function into one function to allow for optimizations with certain things
    //TODO do an SSE/AVX optimized version of this