Begin transitioning ascii functions (at least internally) to Vec<u8> for greater speed

1 year ago · efbcba6080
parent 39bdc0eb62
commit efbcba6080
3 changed files with 96 additions and 11 deletions
--- a/src/helpers.rs
+++ b/src/helpers.rs
@ -41,6 +41,13 @@ pub(crate) fn word_is_uppercase_ascii(english_word_bytes: &[u8]) -> bool {
    return (english_word_bytes[english_word_bytes.len() - 1] as char).is_ascii_uppercase();
 }

+//Clones each element of a slice and push()es it to a vector
+pub(crate) fn push_slice_to_vector<T: Clone>(vec: &mut Vec<T>, slice: &[T]) {
+    for element in slice {
+        vec.push(element.clone());
+    }
+}
+
 /* Tests */

 #[cfg(test)]
--- a/src/translate_strings.rs
+++ b/src/translate_strings.rs
@ -552,7 +552,7 @@ pub fn translate_with_style_ascii(english: &str, suffix_lower: &str, special_cas
    let mut in_contraction_suffix: bool = false;

    //Buffer for improved performance (avoid repeated heap allocations)
-    let mut starting_consonants_buffer = String::with_capacity(64);//Longer than basically all English words to avoid unneeded allocations, plus the fact that this isn't the whole word
+    let mut starting_consonants_buffer = Vec::<u8>::with_capacity(64);//Longer than basically all English words to avoid unneeded allocations, plus the fact that this isn't the whole word

    //Indexes for improved performance (avoid copying characters to use as the english_word argument for translate_word_with_style_reuse_buffers)
    //However, this assumes each character is one byte, so this only works with ASCII strings
--- a/src/translate_words.rs
+++ b/src/translate_words.rs
@ -8,7 +8,7 @@

 /* Imports */

-use crate::helpers::{is_vowel, is_y, word_is_uppercase, word_is_uppercase_ascii};
+use crate::helpers::{is_vowel, is_y, word_is_uppercase, word_is_uppercase_ascii, push_slice_to_vector};

 /* Functions */

@ -92,7 +92,7 @@ pub(crate) fn translate_word_with_style_reuse_buffers (
 pub(crate) fn translate_word_with_style_reuse_buffers_ascii (
    english_word: &str,//Assumes this word is not empty
    suffix_lower: &str, special_case_suffix_lower: &str, suffix_upper: &str, special_case_suffix_upper: &str,
-    buffer_to_append_to: &mut String, starting_consonants: &mut String
+    buffer_to_append_to: &mut String, starting_consonants: &mut Vec<u8>
 ) {
    let english_word_bytes: &[u8] = english_word.as_bytes();

@ -122,7 +122,7 @@ pub(crate) fn translate_word_with_style_reuse_buffers_ascii (
        buffer_to_append_to.push(first_letter);
    } else {
        let first_char_was_upper = first_letter.is_ascii_uppercase();
-        starting_consonants.push(if word_uppercase { first_letter } else { first_letter.to_ascii_lowercase() });
+        starting_consonants.push(if word_uppercase { first_letter as u8 } else { first_letter.to_ascii_lowercase() as u8 });

        //Grab all of the starting consonants, and push the first vowel we enounter to buffer_to_append_to
        while index < english_word_bytes.len() {
@ -136,7 +136,7 @@ pub(crate) fn translate_word_with_style_reuse_buffers_ascii (
                }
                break;
            } else {
-                starting_consonants.push(character);
+                starting_consonants.push(character as u8);
            }

            index += 1;
@ -159,7 +159,8 @@ pub(crate) fn translate_word_with_style_reuse_buffers_ascii (
            buffer_to_append_to.push_str(special_case_suffix_lower);
        }
    } else {
-        buffer_to_append_to.push_str(&starting_consonants);
+        //We know this is valid UTF-8 since it is ASCII and ASCII is UTF-8; I'd like to avoid unsafe rust though
+        buffer_to_append_to.push_str(std::str::from_utf8(starting_consonants.as_slice()).unwrap());//(unsafe { std::str::from_utf8_unchecked(starting_consonants.as_slice()) });
        if word_uppercase {
            buffer_to_append_to.push_str(suffix_upper);
        } else {
@ -168,6 +169,83 @@ pub(crate) fn translate_word_with_style_reuse_buffers_ascii (
    }
 }

+pub(crate) fn translate_word_with_style_reuse_buffers_ascii_new (
+    english_word: &[u8],//Assumes this word is not empty
+    suffix_lower: &[u8], special_case_suffix_lower: &[u8], suffix_upper: &[u8], special_case_suffix_upper: &[u8],
+    buffer_to_append_to: &mut Vec<u8>, starting_consonants: &mut Vec<u8>
+) {
+    if english_word.len() == 1 {
+        push_slice_to_vector(buffer_to_append_to, english_word);
+        push_slice_to_vector(buffer_to_append_to, special_case_suffix_lower);
+        return;
+    }
+
+    //TODO more ascii optimizations
+
+    //Check the first letter
+    let first_letter: char = english_word[0] as char;
+
+    let mut index = 1;
+
+    //Check if the word is uppercase
+    let word_uppercase = word_is_uppercase_ascii(english_word);
+
+    //As a herustic, we consider Y to be a vowel when it is not at the start of the word
+    let first_letter_was_vowel: bool = is_vowel(first_letter);//Not including y
+
+    //Clear the starting_consonants buffer we were given
+    starting_consonants.truncate(0);
+
+    if first_letter_was_vowel {
+        buffer_to_append_to.push(english_word[0]);
+    } else {
+        let first_char_was_upper = first_letter.is_ascii_uppercase();
+        starting_consonants.push(if word_uppercase { first_letter as u8 } else { first_letter.to_ascii_lowercase() as u8 });
+
+        //Grab all of the starting consonants, and push the first vowel we enounter to buffer_to_append_to
+        while index < english_word.len() {
+            let character: char = english_word[index] as char;
+            if is_vowel(character) || is_y(character) {//As a herustic, we consider Y to be a vowel when it is not at the start of the word
+                //The vowel is the first letter of the word; we want it match the capitalization of the first letter of the original word
+                if first_char_was_upper {
+                    buffer_to_append_to.push(character.to_ascii_uppercase() as u8);
+                } else {
+                    buffer_to_append_to.push(character.to_ascii_lowercase() as u8);
+                }
+                break;
+            } else {
+                starting_consonants.push(character as u8);
+            }
+
+            index += 1;
+        }
+        index += 1;
+    }
+
+    //Copy all of the remaining letters up to the end of the word
+    while index < english_word.len() {
+        buffer_to_append_to.push(english_word[index]);
+
+        index += 1;
+    }
+
+    //Copy starting consonants and add the suffix, or add the special_case_suffix depending on the circumstances
+    if first_letter_was_vowel {
+        if word_uppercase {
+            push_slice_to_vector(buffer_to_append_to, special_case_suffix_upper);
+        } else {
+            push_slice_to_vector(buffer_to_append_to, special_case_suffix_lower);
+        }
+    } else {
+        push_slice_to_vector(buffer_to_append_to, starting_consonants.as_slice());
+        if word_uppercase {
+            push_slice_to_vector(buffer_to_append_to, suffix_upper);
+        } else {
+            push_slice_to_vector(buffer_to_append_to, suffix_lower);
+        }
+    }
+}
+
 /* Tests */

 #[cfg(test)]
@ -281,7 +359,7 @@ mod tests {
        }

        let mut pig_latin_word = String::with_capacity(64 * 2);//Longer than all English words to avoid unneeded allocations, times 2 to leave room for whitespace, symbols, and the suffix
-        let mut starting_consonants_buffer = String::with_capacity(64);//Longer than basically all English words to avoid unneeded allocations, plus the fact that this isn't the whole word
+        let mut starting_consonants_buffer = Vec::<u8>::with_capacity(64);//Longer than basically all English words to avoid unneeded allocations, plus the fact that this isn't the whole word
        translate_word_with_style_reuse_buffers_ascii (
            english_word,
            suffix_lower, special_case_suffix_lower, &suffix_upper, &special_case_suffix_upper,
@ -383,7 +461,7 @@ mod benches {
    #[bench]
    fn ascii_way_the_word_translator(b: &mut Bencher) {
        let mut pig_latin_word = String::with_capacity(64 * 2);//Longer than all English words to avoid unneeded allocations, times 2 to leave room for whitespace, symbols, and the suffix
-        let mut starting_consonants_buffer = String::with_capacity(64);//Longer than basically all English words to avoid unneeded allocations, plus the fact that this isn't the whole word
+        let mut starting_consonants_buffer = Vec::<u8>::with_capacity(64);//Longer than basically all English words to avoid unneeded allocations, plus the fact that this isn't the whole word

        b.iter(|| {
            let word = test::black_box("translator");
@ -403,7 +481,7 @@ mod benches {
    #[bench]
    fn ascii_yay_the_word_translator(b: &mut Bencher) {
        let mut pig_latin_word = String::with_capacity(64 * 2);//Longer than all English words to avoid unneeded allocations, times 2 to leave room for whitespace, symbols, and the suffix
-        let mut starting_consonants_buffer = String::with_capacity(64);//Longer than basically all English words to avoid unneeded allocations, plus the fact that this isn't the whole word
+        let mut starting_consonants_buffer = Vec::<u8>::with_capacity(64);//Longer than basically all English words to avoid unneeded allocations, plus the fact that this isn't the whole word

        b.iter(|| {
            let word = test::black_box("translator");
@ -423,7 +501,7 @@ mod benches {
    #[bench]
    fn ascii_hay_the_word_translator(b: &mut Bencher) {
        let mut pig_latin_word = String::with_capacity(64 * 2);//Longer than all English words to avoid unneeded allocations, times 2 to leave room for whitespace, symbols, and the suffix
-        let mut starting_consonants_buffer = String::with_capacity(64);//Longer than basically all English words to avoid unneeded allocations, plus the fact that this isn't the whole word
+        let mut starting_consonants_buffer = Vec::<u8>::with_capacity(64);//Longer than basically all English words to avoid unneeded allocations, plus the fact that this isn't the whole word

        b.iter(|| {
            let word = test::black_box("translator");
@ -443,7 +521,7 @@ mod benches {
    #[bench]
    fn ascii_ferb_the_word_translator(b: &mut Bencher) {
        let mut pig_latin_word = String::with_capacity(64 * 2);//Longer than all English words to avoid unneeded allocations, times 2 to leave room for whitespace, symbols, and the suffix
-        let mut starting_consonants_buffer = String::with_capacity(64);//Longer than basically all English words to avoid unneeded allocations, plus the fact that this isn't the whole word
+        let mut starting_consonants_buffer = Vec::<u8>::with_capacity(64);//Longer than basically all English words to avoid unneeded allocations, plus the fact that this isn't the whole word

        b.iter(|| {
            let word = test::black_box("translator");