1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
/// Compute the display width of `text`
///
/// # Examples
///
/// **Note:** When the `unicode` Cargo feature is disabled, all characters are presumed to take up
/// 1 width.  With the feature enabled, function will correctly deal with [combining characters] in
/// their decomposed form (see [Unicode equivalence]).
///
/// An example of a decomposed character is “é”, which can be decomposed into: “e” followed by a
/// combining acute accent: “◌́”.  Without the `unicode` Cargo feature, every `char` has a width of
/// 1. This includes the combining accent:
///
/// ## Emojis and CJK Characters
///
/// Characters such as emojis and [CJK characters] used in the
/// Chinese, Japanese, and Korean languages are seen as double-width,
/// even if the `unicode-width` feature is disabled:
///
/// # Limitations
///
/// The displayed width of a string cannot always be computed from the
/// string alone. This is because the width depends on the rendering
/// engine used. This is particularly visible with [emoji modifier
/// sequences] where a base emoji is modified with, e.g., skin tone or
/// hair color modifiers. It is up to the rendering engine to detect
/// this and to produce a suitable emoji.
///
/// A simple example is “❤️”, which consists of “❤” (U+2764: Black
/// Heart Symbol) followed by U+FE0F (Variation Selector-16). By
/// itself, “❤” is a black heart, but if you follow it with the
/// variant selector, you may get a wider red heart.
///
/// A more complex example would be “👨‍🦰” which should depict a man
/// with red hair. Here the computed width is too large — and the
/// width differs depending on the use of the `unicode-width` feature:
///
/// This happens because the grapheme consists of three code points:
/// “👨” (U+1F468: Man), Zero Width Joiner (U+200D), and “🦰”
/// (U+1F9B0: Red Hair). You can see them above in the test. With
/// `unicode-width` enabled, the ZWJ is correctly seen as having zero
/// width, without it is counted as a double-width character.
///
/// ## Terminal Support
///
/// Modern browsers typically do a great job at combining characters
/// as shown above, but terminals often struggle more. As an example,
/// Gnome Terminal version 3.38.1, shows “❤️” as a big red heart, but
/// shows "👨‍🦰" as “👨🦰”.
///
/// [combining characters]: https://en.wikipedia.org/wiki/Combining_character
/// [Unicode equivalence]: https://en.wikipedia.org/wiki/Unicode_equivalence
/// [CJK characters]: https://en.wikipedia.org/wiki/CJK_characters
/// [emoji modifier sequences]: https://unicode.org/emoji/charts/full-emoji-modifiers.html
#[inline(never)]
pub(crate) fn display_width(text: &str) -> usize {
    let mut width = 0;

    let mut control_sequence = false;
    let control_terminate: char = 'm';

    for ch in text.chars() {
        if ch.is_ascii_control() {
            control_sequence = true;
        } else if control_sequence && ch == control_terminate {
            control_sequence = false;
            continue;
        }

        if !control_sequence {
            width += ch_width(ch);
        }
    }
    width
}

#[cfg(feature = "unicode")]
fn ch_width(ch: char) -> usize {
    unicode_width::UnicodeWidthChar::width(ch).unwrap_or(0)
}

#[cfg(not(feature = "unicode"))]
fn ch_width(_: char) -> usize {
    1
}

#[cfg(test)]
mod tests {
    use super::*;

    #[cfg(feature = "unicode")]
    use unicode_width::UnicodeWidthChar;

    #[test]
    fn emojis_have_correct_width() {
        use unic_emoji_char::is_emoji;

        // Emojis in the Basic Latin (ASCII) and Latin-1 Supplement
        // blocks all have a width of 1 column. This includes
        // characters such as '#' and '©'.
        for ch in '\u{1}'..'\u{FF}' {
            if is_emoji(ch) {
                let desc = format!("{:?} U+{:04X}", ch, ch as u32);

                #[cfg(feature = "unicode")]
                assert_eq!(ch.width().unwrap(), 1, "char: {desc}");

                #[cfg(not(feature = "unicode"))]
                assert_eq!(ch_width(ch), 1, "char: {desc}");
            }
        }

        // Emojis in the remaining blocks of the Basic Multilingual
        // Plane (BMP), in the Supplementary Multilingual Plane (SMP),
        // and in the Supplementary Ideographic Plane (SIP), are all 1
        // or 2 columns wide when unicode-width is used, and always 2
        // columns wide otherwise. This includes all of our favorite
        // emojis such as 😊.
        for ch in '\u{FF}'..'\u{2FFFF}' {
            if is_emoji(ch) {
                let desc = format!("{:?} U+{:04X}", ch, ch as u32);

                #[cfg(feature = "unicode")]
                assert!(ch.width().unwrap() <= 2, "char: {desc}");

                #[cfg(not(feature = "unicode"))]
                assert_eq!(ch_width(ch), 1, "char: {desc}");
            }
        }

        // The remaining planes contain almost no assigned code points
        // and thus also no emojis.
    }

    #[test]
    #[cfg(feature = "unicode")]
    fn display_width_works() {
        assert_eq!("Café Plain".len(), 11); // “é” is two bytes
        assert_eq!(display_width("Café Plain"), 10);
    }

    #[test]
    #[cfg(feature = "unicode")]
    fn display_width_narrow_emojis() {
        assert_eq!(display_width("⁉"), 1);
    }

    #[test]
    #[cfg(feature = "unicode")]
    fn display_width_narrow_emojis_variant_selector() {
        assert_eq!(display_width("⁉\u{fe0f}"), 1);
    }

    #[test]
    #[cfg(feature = "unicode")]
    fn display_width_emojis() {
        assert_eq!(display_width("😂😭🥺🤣✨😍🙏🥰😊🔥"), 20);
    }
}