A simple Rust library to translate from English to Pig Latin!
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
anslatortray-rs/src/byte_string.rs

1060 lines
50 KiB

/* byte_string.rs
* By: John Jekel
* Copyright (C) 2022 John Jekel
* See the LICENSE file at the root of the project for licensing info.
*
* Translation functions operating on &[u8] and Vec::<u8> (higher efficiency, but less user-friendly)
*
*/
//!anslatortray Functions Operating on Byte-Strings
//!
//!As opposed to functions provided in the anslatortray crate's root, which operate on [`&str`] and [`String`], these functions operate on `&[u8]` and [`Vec<u8>`].
//!
//!In performance-sensitive applications, they can allow for some minor optimizations:
//!* One can reuse buffers for getting the result of a translation (as the functions accept a mutable reference to a [`Vec<u8>`] rather than returning data)
//!* One can avoid the penalty of converting to an [`&str`], translating to a [`String`], and having to convert back to raw bytes if one is working solely with byte-strings.
//!
//!Note that both ASCII and UTF-8 byte strings may be passed to these functions, and that valid ASCII/UTF-8 will be returned.
//!In the past there were "ASCII-only" functions that operated on [`String`]s, but these were removed.
//!These byte_string functions are NOT the successors of those functions.
//!Without exception, ALL functions in anslatortray accept both ASCII and UTF-8 text, regardless of whether they operate on byte-strings or char-strings.
//!The modern functions present are faster than the old ASCII ones anyways, even the ones in the crate's root that don't operate on byte-strings.
/* Imports */
use std::num::Wrapping;
/* Functions */
///Translates a multi-word string (including punctuation) into Pig Latin!
///
///Uses the default suffix and special_case_suffix, "ay" and "way" respectively when calling [`translate_with_style()`].
///
///Equivalent to [`translate_way()`].
///
///Note: The resulting translation is appended to the provided buffer, so one may wish to ensure it is cleared before each use or not depending on the application.
///
///# Examples
///
///```
///use anslatortray::byte_string::translate;
///
///let mut buffer = Vec::<u8>::new();
///
///translate(b"Hello world from the coolest Pig Latin translator!", &mut buffer);
///assert_eq!(&buffer, b"Ellohay orldway omfray ethay oolestcay Igpay Atinlay anslatortray!");
///
///buffer.truncate(0);
///translate(b"This library can translate any English text. It can even handle multiple sentences!", &mut buffer);
///assert_eq!(&buffer, b"Isthay ibrarylay ancay anslatetray anyway Englishway exttay. Itway ancay evenway andlehay ultiplemay entencessay!");
///
///buffer.truncate(0);
///translate(b"Let's try some edge cases. That is a contraction, as well as a word where the only vowel is y. Neat, all that works!", &mut buffer);
///assert_eq!(&buffer, b"Etlay's ytray omesay edgeway asescay. Atthay isway away ontractioncay, asway ellway asway away ordway erewhay ethay onlyway owelvay isway yway. Eatnay, allway atthay orksway!");
///
///buffer.truncate(0);
///translate(b"What if a word has no vowels, like this: bcdfghjklmnpqrstvwxz", &mut buffer);
///assert_eq!(&buffer, b"Atwhay ifway away ordway ashay onay owelsvay, ikelay isthay: bcdfghjklmnpqrstvwxzay");
///
///buffer.truncate(0);
///translate(b"Cool, so the heuristics make pretty good guesses with what they're fed!", &mut buffer);
///assert_eq!(&buffer, b"Oolcay, osay ethay euristicshay akemay ettypray oodgay uessesgay ithway atwhay eythay're edfay!");
///
///buffer.truncate(0);
///translate(b"Hello-world", &mut buffer);
///assert_eq!(&buffer, b"Ellohay-orldway");
///
///buffer.truncate(0);
///translate(b"Hyphens-are-difficult-aren't-they?", &mut buffer);
///assert_eq!(&buffer, b"Yphenshay-areway-ifficultday-arenway't-eythay?");
///
///buffer.truncate(0);
///translate(b"The buffer isn't cleared by the translate function beforehand, ", &mut buffer);
///translate(b"so we can do something like this if we wish!", &mut buffer);
///assert_eq!(&buffer, b"Ethay ufferbay isnway't earedclay ybay ethay anslatetray unctionfay eforehandbay, osay eway ancay oday omethingsay ikelay isthay ifway eway ishway!");
///```
pub fn translate(english: &[u8], pig_latin_string: &mut Vec::<u8>) {
    //The default style is way-style, so the work is handed straight to translate_way(),
    //which appends its result to the caller's buffer
    translate_way(english, pig_latin_string)
}
///Translates a multi-word string (including punctuation) into Pig Latin (way-style)!
///
///Uses the suffix and special_case_suffix "ay" and "way" respectively when calling [`translate_with_style()`].
///
///Note: The resulting translation is appended to the provided buffer, so one may wish to ensure it is cleared before each use or not depending on the application.
///
///# Examples
///
///```
///use anslatortray::byte_string::translate_way;
///
///let mut buffer = Vec::<u8>::new();
///
///translate_way(b"Hello world from the coolest Pig Latin translator!", &mut buffer);
///assert_eq!(&buffer, b"Ellohay orldway omfray ethay oolestcay Igpay Atinlay anslatortray!");
///
///buffer.truncate(0);
///translate_way(b"This library can translate any English text. It can even handle multiple sentences!", &mut buffer);
///assert_eq!(&buffer, b"Isthay ibrarylay ancay anslatetray anyway Englishway exttay. Itway ancay evenway andlehay ultiplemay entencessay!");
///
///buffer.truncate(0);
///translate_way(b"Let's try some edge cases. That is a contraction, as well as a word where the only vowel is y. Neat, all that works!", &mut buffer);
///assert_eq!(&buffer, b"Etlay's ytray omesay edgeway asescay. Atthay isway away ontractioncay, asway ellway asway away ordway erewhay ethay onlyway owelvay isway yway. Eatnay, allway atthay orksway!");
///
///buffer.truncate(0);
///translate_way(b"What if a word has no vowels, like this: bcdfghjklmnpqrstvwxz", &mut buffer);
///assert_eq!(&buffer, b"Atwhay ifway away ordway ashay onay owelsvay, ikelay isthay: bcdfghjklmnpqrstvwxzay");
///
///buffer.truncate(0);
///translate_way(b"Cool, so the heuristics make pretty good guesses with what they're fed!", &mut buffer);
///assert_eq!(&buffer, b"Oolcay, osay ethay euristicshay akemay ettypray oodgay uessesgay ithway atwhay eythay're edfay!");
///
///buffer.truncate(0);
///translate_way(b"Hello-world", &mut buffer);
///assert_eq!(&buffer, b"Ellohay-orldway");
///
///buffer.truncate(0);
///translate_way(b"Hyphens-are-difficult-aren't-they?", &mut buffer);
///assert_eq!(&buffer, b"Yphenshay-areway-ifficultday-arenway't-eythay?");
///
///buffer.truncate(0);
///translate_way(b"The buffer isn't cleared by the translate function beforehand, ", &mut buffer);
///translate_way(b"so we can do something like this if we wish!", &mut buffer);
///assert_eq!(&buffer, b"Ethay ufferbay isnway't earedclay ybay ethay anslatetray unctionfay eforehandbay, osay eway ancay oday omethingsay ikelay isthay ifway eway ishway!");
///```
pub fn translate_way(english: &[u8], pig_latin_string: &mut Vec::<u8>) {
    //Both cases of each suffix are spelled out as literals so that nothing has to be converted at runtime
    let suffix_lower: &[u8] = b"ay";
    let special_case_suffix_lower: &[u8] = b"way";
    let suffix_upper: &[u8] = b"AY";
    let special_case_suffix_upper: &[u8] = b"WAY";
    translate_with_style_lower_and_upper_suffixes(
        english,
        suffix_lower, special_case_suffix_lower, suffix_upper, special_case_suffix_upper,
        pig_latin_string
    );
}
///Translates a multi-word string (including punctuation) into Pig Latin (yay-style)!
///
///Uses the suffix and special_case_suffix "ay" and "yay" respectively when calling [`translate_with_style()`].
///
///Note: The resulting translation is appended to the provided buffer, so one may wish to ensure it is cleared before each use or not depending on the application.
///
///# Examples
///
///```
///use anslatortray::byte_string::translate_yay;
///
///let mut buffer = Vec::<u8>::new();
///
///translate_yay(b"Hello world from the coolest Pig Latin translator!", &mut buffer);
///assert_eq!(&buffer, b"Ellohay orldway omfray ethay oolestcay Igpay Atinlay anslatortray!");
///
///buffer.truncate(0);
///translate_yay(b"This library can translate any English text. It can even handle multiple sentences!", &mut buffer);
///assert_eq!(&buffer, b"Isthay ibrarylay ancay anslatetray anyyay Englishyay exttay. Ityay ancay evenyay andlehay ultiplemay entencessay!");
///
///buffer.truncate(0);
///translate_yay(b"Let's try some edge cases. That is a contraction, as well as a word where the only vowel is y. Neat, all that works!", &mut buffer);
///assert_eq!(&buffer, b"Etlay's ytray omesay edgeyay asescay. Atthay isyay ayay ontractioncay, asyay ellway asyay ayay ordway erewhay ethay onlyyay owelvay isyay yyay. Eatnay, allyay atthay orksway!");
///
///buffer.truncate(0);
///translate_yay(b"What if a word has no vowels, like this: bcdfghjklmnpqrstvwxz", &mut buffer);
///assert_eq!(&buffer, b"Atwhay ifyay ayay ordway ashay onay owelsvay, ikelay isthay: bcdfghjklmnpqrstvwxzay");
///
///buffer.truncate(0);
///translate_yay(b"Cool, so the heuristics make pretty good guesses with what they're fed!", &mut buffer);
///assert_eq!(&buffer, b"Oolcay, osay ethay euristicshay akemay ettypray oodgay uessesgay ithway atwhay eythay're edfay!");
///
///buffer.truncate(0);
///translate_yay(b"Hello-world", &mut buffer);
///assert_eq!(&buffer, b"Ellohay-orldway");
///
///buffer.truncate(0);
///translate_yay(b"Hyphens-are-difficult-aren't-they?", &mut buffer);
///assert_eq!(&buffer, b"Yphenshay-areyay-ifficultday-arenyay't-eythay?");
///
///buffer.truncate(0);
///translate_yay(b"The buffer isn't cleared by the translate function beforehand, ", &mut buffer);
///translate_yay(b"so we can do something like this if we wish!", &mut buffer);
///assert_eq!(&buffer, b"Ethay ufferbay isnyay't earedclay ybay ethay anslatetray unctionfay eforehandbay, osay eway ancay oday omethingsay ikelay isthay ifyay eway ishway!");
///```
pub fn translate_yay(english: &[u8], pig_latin_string: &mut Vec::<u8>) {
    //Both cases of each suffix are spelled out as literals so that nothing has to be converted at runtime.
    //The uppercase special-case suffix must be "YAY" (the uppercase form of "yay"), otherwise uppercase words
    //that start with a vowel would end up with the way-style suffix instead.
    translate_with_style_lower_and_upper_suffixes(english, b"ay", b"yay", b"AY", b"YAY", pig_latin_string);
}
///Translates a multi-word string (including punctuation) into Pig Latin (hay-style)!
///
///Uses the suffix and special_case_suffix "ay" and "hay" respectively when calling [`translate_with_style()`].
///
///Note: The resulting translation is appended to the provided buffer, so one may wish to ensure it is cleared before each use or not depending on the application.
///
///# Examples
///
///```
///use anslatortray::byte_string::translate_hay;
///
///let mut buffer = Vec::<u8>::new();
///
///translate_hay(b"Hello world from the coolest Pig Latin translator!", &mut buffer);
///assert_eq!(&buffer, b"Ellohay orldway omfray ethay oolestcay Igpay Atinlay anslatortray!");
///
///buffer.truncate(0);
///translate_hay(b"This library can translate any English text. It can even handle multiple sentences!", &mut buffer);
///assert_eq!(&buffer, b"Isthay ibrarylay ancay anslatetray anyhay Englishhay exttay. Ithay ancay evenhay andlehay ultiplemay entencessay!");
///
///buffer.truncate(0);
///translate_hay(b"Let's try some edge cases. That is a contraction, as well as a word where the only vowel is y. Neat, all that works!", &mut buffer);
///assert_eq!(&buffer, b"Etlay's ytray omesay edgehay asescay. Atthay ishay ahay ontractioncay, ashay ellway ashay ahay ordway erewhay ethay onlyhay owelvay ishay yhay. Eatnay, allhay atthay orksway!");
///
///buffer.truncate(0);
///translate_hay(b"What if a word has no vowels, like this: bcdfghjklmnpqrstvwxz", &mut buffer);
///assert_eq!(&buffer, b"Atwhay ifhay ahay ordway ashay onay owelsvay, ikelay isthay: bcdfghjklmnpqrstvwxzay");
///
///buffer.truncate(0);
///translate_hay(b"Cool, so the heuristics make pretty good guesses with what they're fed!", &mut buffer);
///assert_eq!(&buffer, b"Oolcay, osay ethay euristicshay akemay ettypray oodgay uessesgay ithway atwhay eythay're edfay!");
///
///buffer.truncate(0);
///translate_hay(b"Hello-world", &mut buffer);
///assert_eq!(&buffer, b"Ellohay-orldway");
///
///buffer.truncate(0);
///translate_hay(b"Hyphens-are-difficult-aren't-they?", &mut buffer);
///assert_eq!(&buffer, b"Yphenshay-arehay-ifficultday-arenhay't-eythay?");
///
///buffer.truncate(0);
///translate_hay(b"The buffer isn't cleared by the translate function beforehand, ", &mut buffer);
///translate_hay(b"so we can do something like this if we wish!", &mut buffer);
///assert_eq!(&buffer, b"Ethay ufferbay isnhay't earedclay ybay ethay anslatetray unctionfay eforehandbay, osay eway ancay oday omethingsay ikelay isthay ifhay eway ishway!");
///```
pub fn translate_hay(english: &[u8], pig_latin_string: &mut Vec::<u8>) {
    //Both cases of each suffix are spelled out as literals so that nothing has to be converted at runtime
    let suffix_lower: &[u8] = b"ay";
    let special_case_suffix_lower: &[u8] = b"hay";
    let suffix_upper: &[u8] = b"AY";
    let special_case_suffix_upper: &[u8] = b"HAY";
    translate_with_style_lower_and_upper_suffixes(
        english,
        suffix_lower, special_case_suffix_lower, suffix_upper, special_case_suffix_upper,
        pig_latin_string
    );
}
///Translates a multi-word string (including punctuation) into Ferb Latin!
///
///Uses the suffix and special_case_suffix "erb" and "ferb" respectively when calling [`translate_with_style()`].
///
///Note: The resulting translation is appended to the provided buffer, so one may wish to ensure it is cleared before each use or not depending on the application.
///
///# Examples
///
///```
///use anslatortray::byte_string::translate_ferb;
///
///let mut buffer = Vec::<u8>::new();
///
///translate_ferb(b"Hello world from the coolest Pig Latin translator!", &mut buffer);
///assert_eq!(&buffer, b"Elloherb orldwerb omfrerb etherb oolestcerb Igperb Atinlerb anslatortrerb!");
///
///buffer.truncate(0);
///translate_ferb(b"This library can translate any English text. It can even handle multiple sentences!", &mut buffer);
///assert_eq!(&buffer, b"Istherb ibrarylerb ancerb anslatetrerb anyferb Englishferb extterb. Itferb ancerb evenferb andleherb ultiplemerb entencesserb!");
///
///buffer.truncate(0);
///translate_ferb(b"Let's try some edge cases. That is a contraction, as well as a word where the only vowel is y. Neat, all that works!", &mut buffer);
///assert_eq!(&buffer, b"Etlerb's ytrerb omeserb edgeferb asescerb. Attherb isferb aferb ontractioncerb, asferb ellwerb asferb aferb ordwerb erewherb etherb onlyferb owelverb isferb yferb. Eatnerb, allferb attherb orkswerb!");
///
///buffer.truncate(0);
///translate_ferb(b"What if a word has no vowels, like this: bcdfghjklmnpqrstvwxz", &mut buffer);
///assert_eq!(&buffer, b"Atwherb ifferb aferb ordwerb asherb onerb owelsverb, ikelerb istherb: bcdfghjklmnpqrstvwxzerb");
///
///buffer.truncate(0);
///translate_ferb(b"Cool, so the heuristics make pretty good guesses with what they're fed!", &mut buffer);
///assert_eq!(&buffer, b"Oolcerb, oserb etherb euristicsherb akemerb ettyprerb oodgerb uessesgerb ithwerb atwherb eytherb're edferb!");
///
///buffer.truncate(0);
///translate_ferb(b"Hello-world", &mut buffer);
///assert_eq!(&buffer, b"Elloherb-orldwerb");
///
///buffer.truncate(0);
///translate_ferb(b"Hyphens-are-difficult-aren't-they?", &mut buffer);
///assert_eq!(&buffer, b"Yphensherb-areferb-ifficultderb-arenferb't-eytherb?");
///
///buffer.truncate(0);
///translate_ferb(b"The buffer isn't cleared by the translate function beforehand, ", &mut buffer);
///translate_ferb(b"so we can do something like this if we wish!", &mut buffer);
///assert_eq!(&buffer, b"Etherb ufferberb isnferb't earedclerb yberb etherb anslatetrerb unctionferb eforehandberb, oserb ewerb ancerb oderb omethingserb ikelerb istherb ifferb ewerb ishwerb!");
///```
pub fn translate_ferb(english: &[u8], pig_latin_string: &mut Vec::<u8>) {
    //Both cases of each suffix are spelled out as literals so that nothing has to be converted at runtime
    let suffix_lower: &[u8] = b"erb";
    let special_case_suffix_lower: &[u8] = b"ferb";
    let suffix_upper: &[u8] = b"ERB";
    let special_case_suffix_upper: &[u8] = b"FERB";
    translate_with_style_lower_and_upper_suffixes(
        english,
        suffix_lower, special_case_suffix_lower, suffix_upper, special_case_suffix_upper,
        pig_latin_string
    );
}
///Translates a multi-word string (including punctuation) into a custom-styled play language!
///
///Pass the string you wish to translate, the suffix you wish to have appended to most words, and the suffix
///you wish to have appended in various special-cases (such as when a word is only one letter or starts with a vowel).
///
///Note: The suffixes must be entirely lower-case or weird results may occur.
///
///Note: The resulting translation is appended to the provided buffer, so one may wish to ensure it is cleared before each use or not depending on the application.
///
///# Examples
///
///```
///use anslatortray::byte_string::translate_with_style;
///
///let suffix = b"ancy";
///let special_case_suffix = b"fancy";
///
///let mut buffer = Vec::<u8>::new();
///
///translate_with_style(b"Hello world from the coolest Pig Latin translator!", suffix, special_case_suffix, &mut buffer);
///assert_eq!(&buffer, b"Ellohancy orldwancy omfrancy ethancy oolestcancy Igpancy Atinlancy anslatortrancy!");
///
///buffer.truncate(0);
///translate_with_style(b"This library can translate any English text. It can even handle multiple sentences!", suffix, special_case_suffix, &mut buffer);
///assert_eq!(&buffer, b"Isthancy ibrarylancy ancancy anslatetrancy anyfancy Englishfancy exttancy. Itfancy ancancy evenfancy andlehancy ultiplemancy entencessancy!");
///
///buffer.truncate(0);
///translate_with_style(b"Let's try some edge cases. That is a contraction, as well as a word where the only vowel is y. Neat, all that works!", suffix, special_case_suffix, &mut buffer);
///assert_eq!(&buffer, b"Etlancy's ytrancy omesancy edgefancy asescancy. Atthancy isfancy afancy ontractioncancy, asfancy ellwancy asfancy afancy ordwancy erewhancy ethancy onlyfancy owelvancy isfancy yfancy. Eatnancy, allfancy atthancy orkswancy!");
///
///buffer.truncate(0);
///translate_with_style(b"What if a word has no vowels, like this: bcdfghjklmnpqrstvwxz", suffix, special_case_suffix, &mut buffer);
///assert_eq!(&buffer, b"Atwhancy iffancy afancy ordwancy ashancy onancy owelsvancy, ikelancy isthancy: bcdfghjklmnpqrstvwxzancy");
///
///buffer.truncate(0);
///translate_with_style(b"Cool, so the heuristics make pretty good guesses with what they're fed!", suffix, special_case_suffix, &mut buffer);
///assert_eq!(&buffer, b"Oolcancy, osancy ethancy euristicshancy akemancy ettyprancy oodgancy uessesgancy ithwancy atwhancy eythancy're edfancy!");
///
///buffer.truncate(0);
///translate_with_style(b"Hello-world", suffix, special_case_suffix, &mut buffer);
///assert_eq!(&buffer, b"Ellohancy-orldwancy");
///
///buffer.truncate(0);
///translate_with_style(b"Hyphens-are-difficult-aren't-they?", suffix, special_case_suffix, &mut buffer);
///assert_eq!(&buffer, b"Yphenshancy-arefancy-ifficultdancy-arenfancy't-eythancy?");
///
///buffer.truncate(0);
///translate_with_style(b"The buffer isn't cleared by the translate function beforehand, ", suffix, special_case_suffix, &mut buffer);
///translate_with_style(b"so we can do something like this if we wish!", suffix, special_case_suffix, &mut buffer);
///assert_eq!(&buffer, b"Ethancy ufferbancy isnfancy't earedclancy ybancy ethancy anslatetrancy unctionfancy eforehandbancy, osancy ewancy ancancy odancy omethingsancy ikelancy isthancy iffancy ewancy ishwancy!");
///```
pub fn translate_with_style(english: &[u8], suffix_lower: &[u8], special_case_suffix_lower: &[u8], pig_latin_string: &mut Vec::<u8>) {
    //Words judged to be uppercase need uppercase versions of the two suffixes we were given, so build those first.
    //NOTE: The standard ASCII conversion is used here rather than fast_to_ascii_uppercase, since the suffixes
    //may contain UTF-8 or non-letters (such bytes are left untouched by the standard conversion)
    let suffix_upper: Vec<u8> = suffix_lower.to_ascii_uppercase();
    let special_case_suffix_upper: Vec<u8> = special_case_suffix_lower.to_ascii_uppercase();

    //With both cases available, the shared implementation does the actual translation
    translate_with_style_lower_and_upper_suffixes(
        english,
        suffix_lower, special_case_suffix_lower, &suffix_upper, &special_case_suffix_upper,
        pig_latin_string
    );
}
//Avoids the overhead of having to convert suffixes to uppercase for the standard translation functions at runtime
//
//Scans `english` once and appends the translation to `pig_latin_string` (the buffer is never cleared here).
//Runs of ASCII letters are treated as words; every other byte (whitespace, punctuation, digits, UTF-8
//sequences, ...) is copied through untouched. An apostrophe directly after a word starts a contraction
//suffix, whose letters are copied as-is as well.
//
//Suffix selection:
//* One-letter words always get `special_case_suffix_lower`
//* Words starting with a vowel get the special-case suffix (uppercase version if the second letter is uppercase)
//* All other words get the regular suffix (uppercase version if the first two letters are uppercase)
//
//Words without any vowel (or non-leading Y) are copied unchanged and given the regular suffix.
pub(crate) fn translate_with_style_lower_and_upper_suffixes (
    english: &[u8],
    suffix_lower: &[u8], special_case_suffix_lower: &[u8], suffix_upper: &[u8], special_case_suffix_upper: &[u8],
    pig_latin_string: &mut Vec::<u8>
) {
    let mut global_index: usize = 0;//Index of the next byte of english that has not been handled yet
    while global_index < english.len() {
        //Copy the characters in-between words as-is
        //TODO this could probably be optimized with vector instructions
        let word_start_index = match english[global_index..].iter().position(|byte| byte.is_ascii_alphabetic()) {
            Some(offset) => global_index + offset,
            None => {
                //No more words; copy all of the characters that remain and finish
                pig_latin_string.extend_from_slice(&english[global_index..]);
                return;
            }
        };
        pig_latin_string.extend_from_slice(&english[global_index..word_start_index]);

        //Find the end of the word BEFORE looking at its letters, so that nothing below can read past the word
        let word_end_index = english[word_start_index..].iter()
            .position(|byte| !byte.is_ascii_alphabetic())
            .map_or(english.len(), |offset| word_start_index + offset);
        let word = &english[word_start_index..word_end_index];//Never empty: it starts on a letter
        global_index = word_end_index;

        //Translate the current word
        let first_letter = word[0];
        if word.len() == 1 {//The word is only one letter long (special case)
            //Push the letter and add the lowercase special suffix (even if the letter is uppercase)
            pig_latin_string.push(first_letter);
            pig_latin_string.extend_from_slice(special_case_suffix_lower);
        } else if is_vowel(first_letter) {//The word is longer than a letter and starts with a vowel (special case)
            //The whole word (including its first letter) is kept as-is
            pig_latin_string.extend_from_slice(word);
            if fast_is_ascii_uppercase(word[1]) {//As a heuristic, we consider the word to be uppercase if the second letter is
                pig_latin_string.extend_from_slice(special_case_suffix_upper);
            } else {//Word is entirely lowercase, or its first letter is uppercase only
                pig_latin_string.extend_from_slice(special_case_suffix_lower);
            }
        } else {//The word is longer than a letter and doesn't start with a vowel
            let first_is_upper = fast_is_ascii_uppercase(first_letter);
            //As a heuristic, we consider the word to be uppercase if the second letter is uppercase too
            let word_is_all_caps = first_is_upper && fast_is_ascii_uppercase(word[1]);

            //As a heuristic, we consider Y to be a vowel when it is not at the start of the word.
            //Only the word itself is searched, never the text that follows it.
            match word[1..].iter().position(|&letter| is_vowel(letter) || is_y(letter)) {
                Some(offset) => {
                    let (starting_consonants, vowel_to_end) = word.split_at(offset + 1);
                    if first_is_upper && !word_is_all_caps {//Word starts with an uppercase letter, but is otherwise lowercase
                        //Push the vowel, matching the starting case of the original word
                        pig_latin_string.push(fast_to_ascii_uppercase(vowel_to_end[0]));
                        //Push all letters after the vowel
                        pig_latin_string.extend_from_slice(&vowel_to_end[1..]);
                        //Push the first starting consonant, which should be lowercase now
                        pig_latin_string.push(fast_to_ascii_lowercase(first_letter));
                        //Push the remaining starting consonants
                        pig_latin_string.extend_from_slice(&starting_consonants[1..]);
                    } else {//Word is entirely uppercase or starts with a lowercase letter; no case changes needed
                        pig_latin_string.extend_from_slice(vowel_to_end);
                        pig_latin_string.extend_from_slice(starting_consonants);
                    }
                }
                None => {//The word has no vowel at all, so there is nothing to move around
                    pig_latin_string.extend_from_slice(word);
                }
            }

            //Push the normal suffix in the matching case
            pig_latin_string.extend_from_slice(if word_is_all_caps { suffix_upper } else { suffix_lower });
        }

        //Copy the contraction suffix (the apostrophe and the letters right after it) as-is, if present
        //TODO if this is true we can also skip the regular inter-word search on the next iteration
        if english.get(global_index) == Some(&b'\'') {
            let after_apostrophe_index = global_index + 1;
            let contraction_end_index = english[after_apostrophe_index..].iter()
                .position(|byte| !byte.is_ascii_alphabetic())
                .map_or(english.len(), |offset| after_apostrophe_index + offset);
            pig_latin_string.extend_from_slice(&english[global_index..contraction_end_index]);
            global_index = contraction_end_index;
        }
    }
}
//Avoids the overhead of having to convert suffixes to uppercase for the standard translation functions at runtime
//
//Byte-at-a-time variant: walks english once, remembering whether it is currently inside a word or inside a
//contraction suffix. Each completed word is handed to translate_word_with_style_reuse_buffers(), while
//everything else (symbols, whitespace, contraction suffixes) is copied to pig_latin_string unchanged.
pub(crate) fn translate_with_style_lower_and_upper_suffixes_old (
    english: &[u8],
    suffix_lower: &[u8], special_case_suffix_lower: &[u8], suffix_upper: &[u8], special_case_suffix_upper: &[u8],
    pig_latin_string: &mut Vec::<u8>
) {
    if english.is_empty() {
        return;
    }

    //TODO merge the word and the generic text function into one function to allow for optimizations with certain things
    //TODO do an SSE/AVX optimized version of this

    //Where the word currently being read began (None while we are not inside a word); words are passed on as
    //slices of english so their letters never need to be copied into a temporary buffer
    let mut current_word_start: Option<usize> = None;
    //Set after the apostrophe that follows a word, until the first non-letter after it
    let mut in_contraction_suffix: bool = false;

    for (index, &character) in english.iter().enumerate() {
        let is_letter = character.is_ascii_alphabetic();

        if in_contraction_suffix {
            //We never translate the contraction suffix of a word, so its letters are copied as-is.
            //The first non-letter ends the contraction (and the word), but is still copied to the output.
            pig_latin_string.push(character);
            in_contraction_suffix = is_letter;
            continue;
        }

        match current_word_start {
            Some(_) if is_letter => {
                //Still inside the word; it gets longer simply because index moves on
            }
            Some(word_start) => {
                //The word (or the first part of a contraction) ended, so translate what we've identified
                translate_word_with_style_reuse_buffers (
                    &english[word_start..index],
                    suffix_lower, special_case_suffix_lower, suffix_upper, special_case_suffix_upper,
                    pig_latin_string
                );
                current_word_start = None;

                //Append the symbol/whitespace that ended the word after the translated word
                pig_latin_string.push(character);

                //If that symbol is an apostrophe, what follows is a contraction suffix
                in_contraction_suffix = character == b'\'';
            }
            None if is_letter => {
                //A letter outside of a word starts a new word
                current_word_start = Some(index);
            }
            None => {
                //Otherwise copy symbols and whitespace as-is
                pig_latin_string.push(character);
            }
        }
    }

    //If the input ended in the middle of a word (but not in a contraction suffix), translate that last word too
    if let Some(word_start) = current_word_start {
        translate_word_with_style_reuse_buffers (
            &english[word_start..],
            suffix_lower, special_case_suffix_lower, suffix_upper, special_case_suffix_upper,
            pig_latin_string
        );
    }
}
//Translate a single word and append the result to buffer_to_append_to
//(english_word MUST ONLY CONTAIN ASCII LETTERS, not numbers/symbols/etc or anything UTF-8, and MUST NOT be empty)
//
//* One-letter words are copied and given special_case_suffix_lower
//* Words starting with a vowel (not including Y) are copied and given the special-case suffix
//* Other words have their leading consonants moved behind the rest of the word and are given the regular suffix
//
//The uppercase suffixes are used whenever word_is_uppercase() reports the word as uppercase.
#[inline(always)]//Only used by the one function in this module, so this makes sense
fn translate_word_with_style_reuse_buffers (
    english_word: &[u8],//Assumes this word is not empty
    suffix_lower: &[u8], special_case_suffix_lower: &[u8], suffix_upper: &[u8], special_case_suffix_upper: &[u8],
    buffer_to_append_to: &mut Vec<u8>
) {
    //Assume the word is at least 1 letter
    debug_assert!(english_word.len() != 0);
    if english_word.is_empty() {
        //SAFETY: Callers must uphold the non-empty requirement stated above; this only tells the optimizer
        //that the empty case never occurs. Passing an empty word is undefined behaviour in release builds.
        unsafe {
            std::hint::unreachable_unchecked();
        }
    }

    let leading_letter: u8 = english_word[0];

    //Special case for 1-letter words
    if english_word.len() == 1 {//TODO annotate this branch as unlikely taken
        //TODO it may be better to chain these back to back in a single call so the vector gets a hint with how much it needs to resize for both at once
        //See https://stackoverflow.com/questions/71785682/calling-extend-from-slice-multiple-times
        buffer_to_append_to.extend_from_slice(english_word);
        buffer_to_append_to.extend_from_slice(special_case_suffix_lower);
        return;
    }

    //The case of the suffix follows the case of the word
    let all_caps: bool = word_is_uppercase(english_word);

    //Special case for words starting with a vowel (Y does not count at the start of a word)
    if is_vowel(leading_letter) {//TODO annotate this branch as unlikely taken
        buffer_to_append_to.extend_from_slice(english_word);
        buffer_to_append_to.extend_from_slice(if all_caps { special_case_suffix_upper } else { special_case_suffix_lower });
        return;
    }

    //Look for the first vowel after the first letter (index 0 was handled above)
    //As a heuristic, we consider Y to be a vowel when it is not at the start of the word
    let first_vowel_search = english_word.iter().skip(1).position(|&letter| is_vowel(letter) || is_y(letter));
    match first_vowel_search {//TODO mark the Some branch as likely taken
        Some(offset_after_first_letter) => {
            let vowel_index = offset_after_first_letter + 1;
            let vowel: u8 = english_word[vowel_index];

            //The vowel becomes the new first letter, so it takes over the capitalization of the old first letter
            buffer_to_append_to.push(if fast_is_ascii_uppercase(leading_letter) { fast_to_ascii_uppercase(vowel) } else { vowel });

            //Then everything that came after the vowel
            buffer_to_append_to.extend_from_slice(&english_word[(vowel_index + 1)..]);

            //Then the old first letter, which is no longer at the start and so is made lowercase
            //Unless, of course, the whole word is uppercase, in which case it is left alone
            buffer_to_append_to.push(if all_caps { leading_letter } else { fast_to_ascii_lowercase(leading_letter) });

            //And finally the rest of the starting consonants
            buffer_to_append_to.extend_from_slice(&english_word[1..vowel_index]);
        }
        None => {
            //This word doesn't have a vowel, so just copy it as-is
            buffer_to_append_to.extend_from_slice(english_word);
        }
    }

    //Add the regular suffix in the matching case
    buffer_to_append_to.extend_from_slice(if all_caps { suffix_upper } else { suffix_lower });//TODO annotate the uppercase case as unlikely
}
//Reports whether the given byte is one of the ASCII vowels a, e, i, o or u (in either case).
//Y is never considered a vowel by this function; see is_y() for that.
#[inline(always)]//Only used by the one function in this module, so this makes sense
fn is_vowel(letter: u8) -> bool {
    matches!(letter, b'a' | b'e' | b'i' | b'o' | b'u' | b'A' | b'E' | b'I' | b'O' | b'U')
}
//Reports whether the given byte is the ASCII letter y (in either case).
#[inline(always)]//Only used by the one function in this module, so this makes sense
fn is_y(letter: u8) -> bool {
    matches!(letter, b'y' | b'Y')
}
//Guesses whether an entire word is uppercase by inspecting only its final letter.
//Heuristic: if the last letter of the word is uppercase, the whole word most likely is too.
//english_word must never be empty; this is only verified in debug builds.
#[inline(always)]//Only used by the one function in this module, so this makes sense
fn word_is_uppercase(english_word: &[u8]) -> bool {
    debug_assert!(!english_word.is_empty());
    match english_word.last() {
        Some(&final_letter) => fast_is_ascii_uppercase(final_letter),
        //SAFETY: relies on the caller upholding the precondition that the word is not empty, so there is always a last letter
        None => unsafe { std::hint::unreachable_unchecked() },
    }
}
//Cheap uppercase check that needs just one comparison.
//For ASCII letters it answers the same as u8::is_ascii_uppercase(), since A-Z all sort before a-z.
//NOTE the result is undefined if the character is not a letter
#[inline(always)]//Only used by the one function in this module, so this makes sense
fn fast_is_ascii_uppercase(letter: u8) -> bool {
    const LAST_UPPERCASE_LETTER: u8 = b'Z';
    letter <= LAST_UPPERCASE_LETTER
}
//NOTE the result is undefined if the character is not a letter
/*#[inline(always)]//Only used by the one function in this module, so this makes sense
fn fast_is_ascii_lowercase(letter: u8) -> bool {
return letter >= b'a';
}*/
//Cheap conversion of an ASCII letter to uppercase.
//Every byte at or above b'a' is shifted down by the distance between the two cases; all other bytes are returned unchanged.
//NOTE if the character is not an ascii letter, this may produce invalid UTF-8
#[inline(always)]//Only used by the one function in this module, so this makes sense
fn fast_to_ascii_uppercase(letter: u8) -> u8 {
    //Distance between a lowercase ASCII letter and its uppercase counterpart (0x20)
    const CASE_OFFSET: Wrapping<u8> = Wrapping(b'a' - b'A');
    if letter < b'a' {
        letter
    } else {
        (Wrapping(letter) - CASE_OFFSET).0
    }
}
//Cheap conversion of an ASCII letter to lowercase.
//Every byte at or below b'Z' is shifted up by the distance between the two cases; all other bytes are returned unchanged.
//NOTE if the character is not an ascii letter, this may produce invalid UTF-8
#[inline(always)]//Only used by the one function in this module, so this makes sense
fn fast_to_ascii_lowercase(letter: u8) -> u8 {
    //Distance between an uppercase ASCII letter and its lowercase counterpart (0x20)
    const CASE_OFFSET: Wrapping<u8> = Wrapping(b'a' - b'A');
    match letter {
        0..=b'Z' => (Wrapping(letter) + CASE_OFFSET).0,
        _ => letter,
    }
}
/* Tests */
#[cfg(test)]
mod tests {
    use super::*;

    //NOTE: We don't test byte_string::translate_with_style and other similar functions in here directly since we test them through string.rs

    //Every (suffix, special_case_suffix) pair that the word-level tests are run against
    const SUFFIX_SPECIAL_CASE_SUFFIX_PAIRS: [(&str, &str); 9] = [
        ("ay", "way"), ("ay", "yay"), ("ay", "hay"), ("erb", "ferb"), ("ancy", "fancy"), ("orange", "porange"), ("anana", "banana"), ("atin", "latin"), ("ust", "rust")
    ];

    //All ASCII letters, used to compare the fast_* helpers against their standard library counterparts
    const ASCII_LETTERS: &[u8] = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

    //A selection of bytes that are not letters at all
    const NOT_LETTERS: &[u8] = b" !@#$%^&*()_+={}|\":>?~`\\][';/.,\t\n";

    //Checks lowercase and capitalized words against every suffix pair
    #[test]
    fn test_translate_word_with_style() {
        //Words starting with a consonant: (English word, expected Pig Latin word before the regular suffix is appended)
        let regular_words = [
            ("Hello", "Elloh"),
            ("World", "Orldw"),
            ("This", "Isth"),
            ("test", "estt"),
            ("the", "eth"),
            ("function", "unctionf"),
            ("translate", "anslatetr"),
            ("word", "ordw"),
            ("Love", "Ovel"),
            ("Pig", "Igp"),
            ("Latin", "Atinl"),
            ("You", "Ouy"),//Y isn't a vowel here
            ("should", "ouldsh"),
            ("try", "ytr"),//Y is a vowel here
            ("yougurt", "ougurty"),//Y isn't a vowel here
            ("quite", "uiteq"),//Awful to pronounce, but correct
            ("nice", "icen"),
            ("hmm", "hmm"),//No vowel at all, so the letters stay where they are
        ];

        //One-letter words and words starting with a vowel keep their letters and get the special-case suffix
        let special_case_words = ["is", "a", "of", "I"];

        for (suffix, special_case_suffix) in SUFFIX_SPECIAL_CASE_SUFFIX_PAIRS {
            for (english_word, pig_latin_stem) in regular_words {
                assert_eq!(
                    translate_word_with_style(english_word, suffix, special_case_suffix), pig_latin_stem.to_string() + suffix,
                    "translating {:?} with suffix {:?}", english_word, suffix
                );
            }

            for english_word in special_case_words {
                assert_eq!(
                    translate_word_with_style(english_word, suffix, special_case_suffix), english_word.to_string() + special_case_suffix,
                    "translating {:?} with special-case suffix {:?}", english_word, special_case_suffix
                );
            }

            //assert_eq!(translate_word_with_style("it's", suffix, special_case_suffix), "it".to_string() + special_case_suffix + "'s");//Contraction
        }
    }

    //Checks that fully uppercase words receive the uppercase suffixes
    #[test]
    fn test_translate_word_with_style_uppercase() {
        for (suffix, special_case_suffix) in SUFFIX_SPECIAL_CASE_SUFFIX_PAIRS {
            let suffix_upper = suffix.to_ascii_uppercase();
            let special_case_suffix_upper = special_case_suffix.to_ascii_uppercase();

            //Uppercase words starting with a consonant stay fully uppercase and get the uppercase regular suffix
            for (english_word, pig_latin_stem) in [("HELLO", "ELLOH"), ("WORLD", "ORLDW"), ("TRY", "YTR"), ("HMM", "HMM")] {
                assert_eq!(
                    translate_word_with_style(english_word, suffix, special_case_suffix), pig_latin_stem.to_string() + &suffix_upper,
                    "translating {:?} with suffix {:?}", english_word, suffix
                );
            }

            //Uppercase words starting with a vowel get the uppercase special-case suffix
            for english_word in ["APPLE", "IS"] {
                assert_eq!(
                    translate_word_with_style(english_word, suffix, special_case_suffix), english_word.to_string() + &special_case_suffix_upper,
                    "translating {:?} with special-case suffix {:?}", english_word, special_case_suffix
                );
            }

            //One-letter words always get the lowercase special-case suffix, even when the letter is a capital
            for english_word in ["I", "A"] {
                assert_eq!(
                    translate_word_with_style(english_word, suffix, special_case_suffix), english_word.to_string() + special_case_suffix,
                    "translating {:?} with special-case suffix {:?}", english_word, special_case_suffix
                );
            }
        }
    }

    //Convenience wrapper so the tests can work with strings: derives the uppercase suffixes from the
    //lowercase ones, translates the word into a fresh buffer and returns the result as a String
    fn translate_word_with_style(english_word: &str, suffix_lower: &str, special_case_suffix_lower: &str) -> String {
        let suffix_upper = suffix_lower.to_ascii_uppercase();
        let special_case_suffix_upper = special_case_suffix_lower.to_ascii_uppercase();

        let mut pig_latin_word = Vec::<u8>::new();
        translate_word_with_style_reuse_buffers (
            english_word.as_bytes(),
            suffix_lower.as_bytes(), special_case_suffix_lower.as_bytes(), suffix_upper.as_bytes(), special_case_suffix_upper.as_bytes(),
            &mut pig_latin_word
        );

        //The tests only ever pass in valid UTF-8, so invalid UTF-8 coming out would be a bug worth panicking over
        String::from_utf8(pig_latin_word).unwrap()
    }

    #[test]
    fn test_is_vowel() {
        for letter in b"aeiouAEIOU".iter() {
            assert!(is_vowel(*letter));
        }

        for letter in b"bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ".iter() {
            assert!(!is_vowel(*letter));
        }

        for not_letter in NOT_LETTERS.iter() {
            assert!(!is_vowel(*not_letter));
        }
    }

    #[test]
    fn test_is_y() {
        for letter in b"yY".iter() {
            assert!(is_y(*letter));
        }

        for letter in b"abcdefghijklmnopqrstuvwxzABCDEFGHIJKLMNOPQRSTUVWXZ".iter() {
            assert!(!is_y(*letter));
        }

        for not_letter in NOT_LETTERS.iter() {
            assert!(!is_y(*not_letter));
        }
    }

    #[test]
    fn test_word_is_uppercase() {
        assert!(word_is_uppercase(b"HELLO"));
        assert!(word_is_uppercase(b"WORLD"));
        assert!(word_is_uppercase(b"I"));

        assert!(!word_is_uppercase(b"would"));
        assert!(!word_is_uppercase(b"like"));
        assert!(!word_is_uppercase(b"a"));
        assert!(!word_is_uppercase(b"pizza"));

        //Only the first letter is a capital, so these do not count as uppercase words
        assert!(!word_is_uppercase(b"Sussus"));
        assert!(!word_is_uppercase(b"Amogus"));
    }

    #[test]
    fn test_fast_is_ascii_uppercase() {
        for letter in ASCII_LETTERS.iter() {
            assert_eq!(fast_is_ascii_uppercase(*letter), letter.is_ascii_uppercase());
        }
    }

    /*#[test]
    fn test_fast_is_ascii_lowercase() {
        for letter in b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".iter() {
            assert_eq!(fast_is_ascii_lowercase(*letter), letter.is_ascii_lowercase());
        }
    }*/

    #[test]
    fn test_fast_to_ascii_uppercase() {
        for letter in ASCII_LETTERS.iter() {
            assert_eq!(fast_to_ascii_uppercase(*letter), letter.to_ascii_uppercase());
        }
    }

    #[test]
    fn test_fast_to_ascii_lowercase() {
        for letter in ASCII_LETTERS.iter() {
            assert_eq!(fast_to_ascii_lowercase(*letter), letter.to_ascii_lowercase());
        }
    }
}
/* Benches */
//Benchmarks for the byte-string translation functions.
//The module is only compiled when the "nightly-features-benches" feature is enabled, and in that case only for test builds.
//It relies on the `test` crate's Bencher, and reaches the functions being measured through `use super::*`.
#[cfg_attr(feature = "nightly-features-benches", cfg(test))]
#[cfg(feature = "nightly-features-benches")]
mod benches {
extern crate test;
use test::Bencher;
use super::*;
//Inputs for the multi-word benchmarks further below
const PROJECT_DESCRIPTION: &[u8] = b"A simple Rust library to translate from English to Pig Latin!";
const LOREM_IPSUM: &[u8] = b"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.";
//Single-word benchmarks: each one repeatedly translates the word "translator" with a different set of suffixes.
//One buffer is allocated up front and emptied again at the end of every iteration, so its capacity is reused.
//NOTE(review): because the buffer is truncated at the end of every iteration, the eprintln!() after each loop always prints an empty string
//Suffixes "ay"/"way"
#[bench]
fn way_the_word_translator(b: &mut Bencher) {
let mut pig_latin_word = Vec::<u8>::with_capacity(64 * 2);//Longer than all English words to avoid unneeded allocations, times 2 to leave room for whitespace, symbols, and the suffix
b.iter(|| {
let word = test::black_box(b"translator");
translate_word_with_style_reuse_buffers (
word,
b"ay", b"way", b"AY", b"WAY",
&mut pig_latin_word
);
pig_latin_word.truncate(0);
});
eprintln!("{}", std::str::from_utf8(pig_latin_word.as_slice()).unwrap());//To avoid optimizing things out
}
//Suffixes "ay"/"yay"
#[bench]
fn yay_the_word_translator(b: &mut Bencher) {
let mut pig_latin_word = Vec::<u8>::with_capacity(64 * 2);//Longer than all English words to avoid unneeded allocations, times 2 to leave room for whitespace, symbols, and the suffix
b.iter(|| {
let word = test::black_box(b"translator");
translate_word_with_style_reuse_buffers (
word,
b"ay", b"yay", b"AY", b"YAY",
&mut pig_latin_word
);
pig_latin_word.truncate(0);
});
eprintln!("{}", std::str::from_utf8(pig_latin_word.as_slice()).unwrap());//To avoid optimizing things out
}
//Suffixes "ay"/"hay"
#[bench]
fn hay_the_word_translator(b: &mut Bencher) {
let mut pig_latin_word = Vec::<u8>::with_capacity(64 * 2);//Longer than all English words to avoid unneeded allocations, times 2 to leave room for whitespace, symbols, and the suffix
b.iter(|| {
let word = test::black_box(b"translator");
translate_word_with_style_reuse_buffers (
word,
b"ay", b"hay", b"AY", b"HAY",
&mut pig_latin_word
);
pig_latin_word.truncate(0);
});
eprintln!("{}", std::str::from_utf8(pig_latin_word.as_slice()).unwrap());//To avoid optimizing things out
}
//Suffixes "erb"/"ferb"
#[bench]
fn ferb_the_word_translator(b: &mut Bencher) {
let mut pig_latin_word = Vec::<u8>::with_capacity(64 * 2);//Longer than all English words to avoid unneeded allocations, times 2 to leave room for whitespace, symbols, and the suffix
b.iter(|| {
let word = test::black_box(b"translator");
translate_word_with_style_reuse_buffers (
word,
b"erb", b"ferb", b"ERB", b"FERB",
&mut pig_latin_word
);
pig_latin_word.truncate(0);
});
eprintln!("{}", std::str::from_utf8(pig_latin_word.as_slice()).unwrap());//To avoid optimizing things out
}
//Multi-word benchmarks over PROJECT_DESCRIPTION, one per translate_* function.
//The output buffer is pre-sized to twice the input length and emptied after every iteration.
#[bench]
fn way_project_description(b: &mut Bencher) {
let mut pig_latin_word = Vec::<u8>::with_capacity(PROJECT_DESCRIPTION.len() * 2);
b.iter(|| {
let word = test::black_box(PROJECT_DESCRIPTION);
translate_way(word, &mut pig_latin_word);
pig_latin_word.truncate(0);
});
}
#[bench]
fn yay_project_description(b: &mut Bencher) {
let mut pig_latin_word = Vec::<u8>::with_capacity(PROJECT_DESCRIPTION.len() * 2);
b.iter(|| {
let word = test::black_box(PROJECT_DESCRIPTION);
translate_yay(word, &mut pig_latin_word);
pig_latin_word.truncate(0);
});
}
#[bench]
fn hay_project_description(b: &mut Bencher) {
let mut pig_latin_word = Vec::<u8>::with_capacity(PROJECT_DESCRIPTION.len() * 2);
b.iter(|| {
let word = test::black_box(PROJECT_DESCRIPTION);
translate_hay(word, &mut pig_latin_word);
pig_latin_word.truncate(0);
});
}
#[bench]
fn ferb_project_description(b: &mut Bencher) {
let mut pig_latin_word = Vec::<u8>::with_capacity(PROJECT_DESCRIPTION.len() * 2);
b.iter(|| {
let word = test::black_box(PROJECT_DESCRIPTION);
translate_ferb(word, &mut pig_latin_word);
pig_latin_word.truncate(0);
});
}
//Multi-word benchmarks over LOREM_IPSUM, structured exactly like the PROJECT_DESCRIPTION ones above
#[bench]
fn way_lorem_ipsum(b: &mut Bencher) {
let mut pig_latin_word = Vec::<u8>::with_capacity(LOREM_IPSUM.len() * 2);
b.iter(|| {
let word = test::black_box(LOREM_IPSUM);
translate_way(word, &mut pig_latin_word);
pig_latin_word.truncate(0);
});
}
#[bench]
fn yay_lorem_ipsum(b: &mut Bencher) {
let mut pig_latin_word = Vec::<u8>::with_capacity(LOREM_IPSUM.len() * 2);
b.iter(|| {
let word = test::black_box(LOREM_IPSUM);
translate_yay(word, &mut pig_latin_word);
pig_latin_word.truncate(0);
});
}
#[bench]
fn hay_lorem_ipsum(b: &mut Bencher) {
let mut pig_latin_word = Vec::<u8>::with_capacity(LOREM_IPSUM.len() * 2);
b.iter(|| {
let word = test::black_box(LOREM_IPSUM);
translate_hay(word, &mut pig_latin_word);
pig_latin_word.truncate(0);
});
}
#[bench]
fn ferb_lorem_ipsum(b: &mut Bencher) {
let mut pig_latin_word = Vec::<u8>::with_capacity(LOREM_IPSUM.len() * 2);
b.iter(|| {
let word = test::black_box(LOREM_IPSUM);
translate_ferb(word, &mut pig_latin_word);
pig_latin_word.truncate(0);
});
}
}