1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
/// Compute the display width of `text`
///
/// # Examples
///
/// **Note:** When the `unicode` Cargo feature is disabled, every character is presumed to be
/// 1 column wide. With the feature enabled, the function correctly handles [combining characters]
/// in their decomposed form (see [Unicode equivalence]).
///
/// An example of a decomposed character is “é”, which can be decomposed into: “e” followed by a
/// combining acute accent: “◌́”. Without the `unicode` Cargo feature, every `char` has a width of
/// 1. This includes the combining accent, so the decomposed form is measured as 2 columns wide.
///
/// ## Emojis and CJK Characters
///
/// Characters such as emojis and [CJK characters] used in the
/// Chinese, Japanese, and Korean languages are seen as double-width
/// when the `unicode` feature is enabled. Without it, they are counted
/// as 1 column wide, like every other `char`.
///
/// # Limitations
///
/// The displayed width of a string cannot always be computed from the
/// string alone. This is because the width depends on the rendering
/// engine used. This is particularly visible with [emoji modifier
/// sequences] where a base emoji is modified with, e.g., skin tone or
/// hair color modifiers. It is up to the rendering engine to detect
/// this and to produce a suitable emoji.
///
/// A simple example is “❤️”, which consists of “❤” (U+2764: Black
/// Heart Symbol) followed by U+FE0F (Variation Selector-16). By
/// itself, “❤” is a black heart, but if you follow it with the
/// variant selector, you may get a wider red heart.
///
/// A more complex example would be “👨🦰” which should depict a man
/// with red hair. Here the computed width is too large, and it
/// differs depending on whether the `unicode` feature is enabled.
///
/// This happens because the grapheme consists of three code points:
/// “👨” (U+1F468: Man), Zero Width Joiner (U+200D), and “🦰”
/// (U+1F9B0: Red Hair). With the `unicode` feature enabled, the ZWJ
/// is correctly seen as having zero width; without it, the ZWJ is
/// counted as 1 column wide, like every other `char`.
///
/// ## Terminal Support
///
/// Modern browsers typically do a great job at combining characters
/// as shown above, but terminals often struggle more. As an example,
/// Gnome Terminal version 3.38.1, shows “❤️” as a big red heart, but
/// shows "👨🦰" as “👨🦰”.
///
/// [combining characters]: https://en.wikipedia.org/wiki/Combining_character
/// [Unicode equivalence]: https://en.wikipedia.org/wiki/Unicode_equivalence
/// [CJK characters]: https://en.wikipedia.org/wiki/CJK_characters
/// [emoji modifier sequences]: https://unicode.org/emoji/charts/full-emoji-modifiers.html
///
/// # Escape Sequences
///
/// Escape sequences that start with ESC (U+001B) and end with `m`, such as the
/// styling sequence `ESC [ 1 m`, are skipped entirely and add nothing to the
/// width. An unterminated sequence hides the rest of `text`. Any other ASCII
/// control character (tab, newline, ...) is counted as zero columns and has no
/// effect on the characters that follow it.
#[inline(never)]
pub(crate) fn display_width(text: &str) -> usize {
    // Only ESC may introduce an escape sequence. Treating every ASCII control
    // character as an introducer would drop all text following a tab or a
    // newline (up to the next `m`) from the computed width.
    const ESCAPE: char = '\u{1b}';
    // Final character of the styling sequences recognised here.
    const SEQUENCE_TERMINATOR: char = 'm';

    let mut width = 0;
    let mut in_escape_sequence = false;

    for ch in text.chars() {
        if in_escape_sequence {
            // Everything up to and including the terminator is invisible.
            in_escape_sequence = ch != SEQUENCE_TERMINATOR;
        } else if ch == ESCAPE {
            in_escape_sequence = true;
        } else if !ch.is_ascii_control() {
            // Remaining control characters fall through without being
            // counted; only visible characters contribute to the width.
            width += ch_width(ch);
        }
    }

    width
}
/// Width of a single `char` in columns, as reported by the `unicode-width` crate.
///
/// A `char` for which the crate reports no width is counted as zero columns.
#[cfg(feature = "unicode")]
fn ch_width(ch: char) -> usize {
    use unicode_width::UnicodeWidthChar;

    // `usize::default()` is 0, the fallback for characters without a width.
    ch.width().unwrap_or_default()
}
/// Fallback used when the `unicode` feature is disabled: every `char`,
/// whatever it is, is counted as exactly one column.
#[cfg(not(feature = "unicode"))]
fn ch_width(_ch: char) -> usize {
    const SINGLE_COLUMN: usize = 1;
    SINGLE_COLUMN
}
#[cfg(test)]
mod tests {
    use super::*;

    #[cfg(feature = "unicode")]
    use unicode_width::UnicodeWidthChar;

    /// Checks the single-`char` width of every emoji below U+2FFFF.
    #[test]
    fn emojis_have_correct_width() {
        use unic_emoji_char::is_emoji;

        // Basic Latin (ASCII) and Latin-1 Supplement: the emojis found
        // here, such as '#' and '©', must be exactly 1 column wide.
        let latin_emojis = ('\u{1}'..'\u{FF}').filter(|&candidate| is_emoji(candidate));
        for emoji in latin_emojis {
            let desc = format!("{:?} U+{:04X}", emoji, u32::from(emoji));

            #[cfg(feature = "unicode")]
            assert_eq!(emoji.width().unwrap(), 1, "char: {desc}");

            #[cfg(not(feature = "unicode"))]
            assert_eq!(ch_width(emoji), 1, "char: {desc}");
        }

        // Rest of the Basic Multilingual Plane (BMP), the Supplementary
        // Multilingual Plane (SMP) and the Supplementary Ideographic
        // Plane (SIP): with the `unicode` feature an emoji such as 😊 may
        // be 1 or 2 columns wide; without it, `ch_width` reports 1.
        let remaining_emojis = ('\u{FF}'..'\u{2FFFF}').filter(|&candidate| is_emoji(candidate));
        for emoji in remaining_emojis {
            let desc = format!("{:?} U+{:04X}", emoji, u32::from(emoji));

            #[cfg(feature = "unicode")]
            assert!(emoji.width().unwrap() <= 2, "char: {desc}");

            #[cfg(not(feature = "unicode"))]
            assert_eq!(ch_width(emoji), 1, "char: {desc}");
        }

        // The planes beyond these contain almost no assigned code points
        // and therefore no emojis, so they are not scanned.
    }

    #[test]
    #[cfg(feature = "unicode")]
    fn display_width_works() {
        let text = "Café Plain";
        // “é” takes two bytes but only a single column.
        assert_eq!(text.len(), 11);
        assert_eq!(display_width(text), 10);
    }

    #[test]
    #[cfg(feature = "unicode")]
    fn display_width_narrow_emojis() {
        let narrow = "⁉";
        assert_eq!(display_width(narrow), 1);
    }

    #[test]
    #[cfg(feature = "unicode")]
    fn display_width_narrow_emojis_variant_selector() {
        // The trailing variation selector must not add to the width.
        let narrow_with_selector = "⁉\u{fe0f}";
        assert_eq!(display_width(narrow_with_selector), 1);
    }

    #[test]
    #[cfg(feature = "unicode")]
    fn display_width_emojis() {
        // Ten emojis, two columns each.
        let emojis = "😂😭🥺🤣✨😍🙏🥰😊🔥";
        assert_eq!(display_width(emojis), 20);
    }
}