wormhole/sui/token_bridge/sources/utils/string_utils.move

module token_bridge::string_utils {
    use std::ascii::{Self};
    use std::string::{Self, String};
    use std::vector::{Self};

    // ASCII code of `?`, the placeholder emitted for every non-ASCII
    // character.
    const QUESTION_MARK: u8 = 63;

    // Recall that UTF-8 characters have variable-length encoding and occupy
    // 1, 2, 3, or 4 bytes.
    // The first byte of a 2, 3, or 4-byte UTF-8 character has a special form
    // indicating how many more bytes follow in the same character
    // representation. Specifically, it has one of the forms
    //  - 110xxxxx // 11000000 is 192 (base 10)
    //  - 1110xxxx // 11100000 is 224 (base 10)
    //  - 11110xxx // 11110000 is 240 (base 10)
    //
    // We can tell the length in bytes of a UTF-8 character by looking at its
    // first byte and counting the leading 1's, or alternatively by checking
    // which of these ranges it falls in:
    // [11000000, 11100000) or [11100000, 11110000) or [11110000, 11111111].
    //
    // The following constants demarcate those ranges and are used by the
    // `to_ascii` function below.
    const UTF8_LENGTH_2_FIRST_BYTE_LOWER_BOUND: u8 = 192;
    const UTF8_LENGTH_3_FIRST_BYTE_LOWER_BOUND: u8 = 224;
    const UTF8_LENGTH_4_FIRST_BYTE_LOWER_BOUND: u8 = 240;

    /// Converts a UTF-8 `String` to an `ascii::String`.
    ///
    /// Trailing zero bytes are dropped first. Every byte accepted by
    /// `ascii::is_valid_char` is then copied through unchanged, and every
    /// multi-byte (non-ascii) UTF-8 character is replaced by a single `?`.
    ///
    /// `ascii::string` aborts when handed an invalid byte, but every byte
    /// written to the output here is either accepted by
    /// `ascii::is_valid_char` or is `?`.
    ///
    /// Note that while the Sui spec limits symbols to only use ascii
    /// characters, the token bridge spec does allow utf8 symbols.
    public fun to_ascii(s: &String): ascii::String {
        let buf = *string::bytes(s);
        // Keep dropping the last byte while it is 0.
        while (
            !vector::is_empty(&buf) &&
            *vector::borrow(&buf, vector::length(&buf) - 1) == 0
        ) {
            vector::pop_back(&mut buf);
        };

        // Run through `buf` to convert any non-ascii character to `?`.
        let asciified = vector::empty();
        let (i, n) = (0, vector::length(&buf));
        while (i < n) {
            let b = *vector::borrow(&buf, i);
            if (ascii::is_valid_char(b)) {
                // A valid ascii character: keep it as is.
                vector::push_back(&mut asciified, b);
                i = i + 1;
            } else {
                // UTF-8 characters are encoded with 1-4 bytes. Replacing
                // every non-ascii byte with "?" would yield several "?" per
                // character; we want exactly one "?" per character instead.
                //
                // The first byte of a 2, 3, or 4-byte character looks like
                //  - 110xxxxx // >= 192
                //  - 1110xxxx // >= 224
                //  - 11110xxx // >= 240
                // and the remaining bytes look like 10xxxxxx.
                //
                // So we emit one "?" at the first byte and skip over the
                // remaining bytes of the same character.
                //
                // Reference: https://en.wikipedia.org/wiki/UTF-8
                vector::push_back(&mut asciified, QUESTION_MARK);
                let width: u64 = if (b >= UTF8_LENGTH_4_FIRST_BYTE_LOWER_BOUND) {
                    // The UTF-8 char has a 4-byte representation.
                    4
                } else if (b >= UTF8_LENGTH_3_FIRST_BYTE_LOWER_BOUND) {
                    // The UTF-8 char has a 3-byte representation.
                    3
                } else if (b >= UTF8_LENGTH_2_FIRST_BYTE_LOWER_BOUND) {
                    // The UTF-8 char has a 2-byte representation.
                    2
                } else {
                    // A stray continuation byte (10xxxxxx). This cannot occur
                    // in well-formed UTF-8, but we still step past it so that
                    // the loop always makes progress and terminates.
                    1
                };
                i = i + width;
            };
        };
        ascii::string(asciified)
    }
}
sui: redesign Wormhole and Token Bridge contracts 2023-05-02 09:22:30 -07:00			`module token_bridge::string_utils {`
			`use std::ascii::{Self};`
			`use std::string::{Self, String};`
			`use std::vector::{Self};`

			`const QUESTION_MARK: u8 = 63;`
			`// Recall that UTF-8 characters have variable-length encoding and can have`
			`// 1, 2, 3, or 4 bytes.`
			`// The first byte of the 2, 3, and 4-byte UTF-8 characters have a special`
			`// form indicating how many more bytes follow in the same character`
			`// representation. Specifically, it can have the forms`
			`// - 110xxxxx // 11000000 is 192 (base 10)`
			`// - 1110xxxx // 11100000 is 224 (base 10)`
			`// - or 11110xxx // 11110000 is 240 (base 10)`
			`//`
			`// We can tell the length the a hex UTF-8 character in bytes by looking`
			`// at the first byte and counting the leading 1's, or alternatively`
			`// seeing whether it falls in the range`
			`// [11000000, 11100000) or [11100000, 11110000) or [11110000, 11111111],`
			`//`
			`// The following constants demarcate those ranges and are used in the`
			`// string32::to_ascii function.`
			`const UTF8_LENGTH_2_FIRST_BYTE_LOWER_BOUND: u8 = 192;`
			`const UTF8_LENGTH_3_FIRST_BYTE_LOWER_BOUND: u8 = 224;`
			`const UTF8_LENGTH_4_FIRST_BYTE_LOWER_BOUND: u8 = 240;`

			`/// Converts a String32 to an ascii string if possible, otherwise errors`
			/// out at `ascii::string(bytes)`. For input strings that contain non-ascii
			/// characters, we will swap the non-ascii character with `?`.
			`///`
			`/// Note that while the Sui spec limits symbols to only use ascii`
			`/// characters, the token bridge spec does allow utf8 symbols.`
			`public fun to_ascii(s: &String): ascii::String {`
			`let buf = *string::bytes(s);`
			`// keep dropping the last character while it's 0`
			`while (`
			`!vector::is_empty(&buf) &&`
			`*vector::borrow(&buf, vector::length(&buf) - 1) == 0`
			`) {`
			`vector::pop_back(&mut buf);`
			`};`

			// Run through `buf` to convert any non-ascii character to `?`.
			`let asciified = vector::empty();`
			`let (i, n) = (0, vector::length(&buf));`
			`while (i < n) {`
			`let b = *vector::borrow(&buf, i);`
			`// If it is a valid ascii character, keep it.`
			`if (ascii::is_valid_char(b)) {`
			`vector::push_back(&mut asciified, b);`
			`i = i + 1;`
			`} else {`
			`// Since UTF-8 characters have variable-length encoding (they are`
			`// represented using 1-4 bytes, unlike ASCII characters, which`
			`// are represented using 1 byte), we don't want to transform`
			`// every byte in a UTF-8 string that does not represent an ASCII`
			`// character to the question mark symbol "?". This would result`
			`// in having too many "?" symbols.`
			`//`
			`// Instead, we want a single "?" for each character. Note that`
			`// the 1-byte UTF-8 characters correspond to valid ASCII`
			`// characters and have the form 0xxxxxxx.`
			`// The 2, 3, and 4-byte UTF-8 characters have first byte equal`
			`// to:`
			`// - 110xxxxx // 192`
			`// - 1110xxxx // 224`
			`// - or 11110xxx // 240`
			`//`
			`// and remaining bytes of the form:`
			`// - 10xxxxxx`
			`//`
			`// To ensure a one-to-one mapping of a multi-byte UTF-8 character`
			`// to a "?", we detect the first byte of a new UTF-8 character`
			`// in a multi-byte representation by checking if it is`
			`// >= 11000000 (base 2) or 192 (base 10) and convert it to a "?"`
			`// and skip the remaining bytes in the same representation.`
			`//`
			`//`
			`// Reference: https://en.wikipedia.org/wiki/UTF-8`
			`if (b >= UTF8_LENGTH_2_FIRST_BYTE_LOWER_BOUND){`
			`vector::push_back(&mut asciified, QUESTION_MARK);`
			`if (b >= UTF8_LENGTH_4_FIRST_BYTE_LOWER_BOUND){`
			`// The UTF-8 char has a 4-byte hex representation.`
			`i = i + 4;`
			`} else if (b >= UTF8_LENGTH_3_FIRST_BYTE_LOWER_BOUND){`
			`// The UTF-8 char has a 3-byte hex representation.`
			`i = i + 3;`
			`} else {`
			`// The UTF-8 char has a 2-byte hex representation.`
			`i = i + 2;`
			`}`
			`}`
			`};`
			`};`
			`ascii::string(asciified)`
			`}`
			`}`