2020-09-02 16:47:03 +08:00
|
|
|
/* Determine display width of Unicode character.
|
2022-07-28 14:16:50 +08:00
|
|
|
Copyright (C) 2001-2002, 2006-2022 Free Software Foundation, Inc.
|
2020-09-02 16:47:03 +08:00
|
|
|
Written by Bruno Haible <bruno@clisp.org>, 2002.
|
|
|
|
|
2022-07-28 14:16:50 +08:00
|
|
|
This file is free software: you can redistribute it and/or modify
|
|
|
|
it under the terms of the GNU Lesser General Public License as
|
|
|
|
published by the Free Software Foundation; either version 2.1 of the
|
|
|
|
License, or (at your option) any later version.
|
2020-09-02 16:47:03 +08:00
|
|
|
|
2022-07-28 14:16:50 +08:00
|
|
|
This file is distributed in the hope that it will be useful,
|
2020-09-02 16:47:03 +08:00
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
2022-07-28 14:16:50 +08:00
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU Lesser General Public License for more details.
|
2020-09-02 16:47:03 +08:00
|
|
|
|
2022-07-28 14:16:50 +08:00
|
|
|
You should have received a copy of the GNU Lesser General Public License
|
2020-09-02 16:47:03 +08:00
|
|
|
along with this program. If not, see <https://www.gnu.org/licenses/>. */
|
|
|
|
|
|
|
|
#include <config.h>
|
|
|
|
|
|
|
|
/* Specification. */
|
|
|
|
#include "uniwidth.h"
|
|
|
|
|
|
|
|
#include "cjk.h"
|
|
|
|
|
2022-07-28 14:16:50 +08:00
|
|
|
/* The non-spacing attribute table consists of:
|
|
|
|
* Non-spacing characters; generated from PropList.txt or
|
|
|
|
"grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
|
|
|
|
* Format control characters; generated from
|
|
|
|
"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
|
|
|
|
* Zero width characters; generated from
|
|
|
|
"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
|
|
|
|
* Hangul Jamo characters that have conjoining behaviour:
|
|
|
|
- jungseong = syllable-middle vowels
|
|
|
|
- jongseong = syllable-final consonants
|
|
|
|
Rationale:
|
|
|
|
1) These characters act like combining characters. They have no
|
|
|
|
equivalent in legacy character sets. Therefore the EastAsianWidth.txt
|
|
|
|
file does not really matter for them; UAX #11 East Asian Width
|
|
|
|
<https://www.unicode.org/reports/tr11/> makes it clear that its focus
|
|
|
|
is on compatibility with traditional Japanese layout.
|
|
|
|
By contrast, the same glyphs without conjoining behaviour are available
|
|
|
|
in the U+3130..U+318F block, and these characters are mapped to legacy
|
|
|
|
character sets, and traditional Japanese layout matters for them.
|
|
|
|
2) glibc does the same thing, see
|
|
|
|
<https://sourceware.org/bugzilla/show_bug.cgi?id=21750>
|
|
|
|
<https://sourceware.org/bugzilla/show_bug.cgi?id=26120>
|
2020-09-02 16:47:03 +08:00
|
|
|
*/
|
2022-07-28 14:16:50 +08:00
|
|
|
#include "uniwidth/width0.h"
|
|
|
|
|
|
|
|
#include "uniwidth/width2.h"
|
|
|
|
#include "unictype/bitmap.h"
|
|
|
|
|
|
|
|
/* Number of elements in the array A.  A must be an expression of array
   type (not a pointer).  The argument is parenthesized before indexing so
   that arguments containing unary or cast operators are measured
   correctly.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
|
|
|
|
|
2020-09-02 16:47:03 +08:00
|
|
|
|
|
|
|
/* Determine number of column positions required for UC. */
|
|
|
|
int
|
|
|
|
uc_width (ucs4_t uc, const char *encoding)
|
|
|
|
{
|
|
|
|
/* Test for non-spacing or control character. */
|
2022-07-28 14:16:50 +08:00
|
|
|
if ((uc >> 9) < SIZEOF (nonspacing_table_ind))
|
2020-09-02 16:47:03 +08:00
|
|
|
{
|
|
|
|
int ind = nonspacing_table_ind[uc >> 9];
|
|
|
|
if (ind >= 0)
|
|
|
|
if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1)
|
|
|
|
{
|
|
|
|
if (uc > 0 && uc < 0xa0)
|
|
|
|
return -1;
|
|
|
|
else
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if ((uc >> 9) == (0xe0000 >> 9))
|
|
|
|
{
|
|
|
|
if (uc >= 0xe0100)
|
|
|
|
{
|
|
|
|
if (uc <= 0xe01ef)
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001)
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
2022-07-28 14:16:50 +08:00
|
|
|
/* Test for double-width character. */
|
|
|
|
if (bitmap_lookup (&u_width2, uc))
|
2020-09-02 16:47:03 +08:00
|
|
|
return 2;
|
|
|
|
/* In ancient CJK encodings, Cyrillic and most other characters are
|
|
|
|
double-width as well. */
|
|
|
|
if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
|
|
|
|
&& is_cjk_encoding (encoding))
|
|
|
|
return 2;
|
|
|
|
return 1;
|
|
|
|
}
|