319 lines
9.8 KiB
C
319 lines
9.8 KiB
C
/* Localization of proper names.
   Copyright (C) 2006-2022 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2006.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program. If not, see <https://www.gnu.org/licenses/>. */
|
|
|
|
/* Without this pragma, gcc 4.7.0 20111124 mistakenly suggests that
   the proper_name function might be a candidate for attribute 'const'.  */
|
|
#if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) || 4 < __GNUC__
|
|
# pragma GCC diagnostic ignored "-Wsuggest-attribute=const"
|
|
#endif
|
|
|
|
#include <config.h>
|
|
|
|
/* Specification. */
|
|
#include "propername.h"
|
|
|
|
#include <ctype.h>
|
|
#include <stdbool.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#if HAVE_ICONV
|
|
# include <iconv.h>
|
|
#endif
|
|
|
|
#include "trim.h"
|
|
#include "mbchar.h"
|
|
#include "mbuiter.h"
|
|
#include "localcharset.h"
|
|
#include "c-strcase.h"
|
|
#include "xstriconv.h"
|
|
#include "xalloc.h"
|
|
#include "gettext.h"
|
|
|
|
|
|
/* Tests whether STRING contains trim (SUB), starting and ending at word
|
|
boundaries.
|
|
Here, instead of implementing Unicode Standard Annex #29 for determining
|
|
word boundaries, we assume that trim (SUB) starts and ends with words and
|
|
only test whether the part before it ends with a non-word and the part
|
|
after it starts with a non-word. */
|
|
static bool
|
|
mbsstr_trimmed_wordbounded (const char *string, const char *sub)
|
|
{
|
|
char *tsub = trim (sub);
|
|
bool found = false;
|
|
|
|
for (; *string != '\0';)
|
|
{
|
|
const char *tsub_in_string = mbsstr (string, tsub);
|
|
if (tsub_in_string == NULL)
|
|
break;
|
|
else
|
|
{
|
|
if (MB_CUR_MAX > 1)
|
|
{
|
|
mbui_iterator_t string_iter;
|
|
bool word_boundary_before;
|
|
bool word_boundary_after;
|
|
|
|
mbui_init (string_iter, string);
|
|
word_boundary_before = true;
|
|
if (mbui_cur_ptr (string_iter) < tsub_in_string)
|
|
{
|
|
mbchar_t last_char_before_tsub;
|
|
do
|
|
{
|
|
if (!mbui_avail (string_iter))
|
|
abort ();
|
|
last_char_before_tsub = mbui_cur (string_iter);
|
|
mbui_advance (string_iter);
|
|
}
|
|
while (mbui_cur_ptr (string_iter) < tsub_in_string);
|
|
if (mb_isalnum (last_char_before_tsub))
|
|
word_boundary_before = false;
|
|
}
|
|
|
|
mbui_init (string_iter, tsub_in_string);
|
|
{
|
|
mbui_iterator_t tsub_iter;
|
|
|
|
for (mbui_init (tsub_iter, tsub);
|
|
mbui_avail (tsub_iter);
|
|
mbui_advance (tsub_iter))
|
|
{
|
|
if (!mbui_avail (string_iter))
|
|
abort ();
|
|
mbui_advance (string_iter);
|
|
}
|
|
}
|
|
word_boundary_after = true;
|
|
if (mbui_avail (string_iter))
|
|
{
|
|
mbchar_t first_char_after_tsub = mbui_cur (string_iter);
|
|
if (mb_isalnum (first_char_after_tsub))
|
|
word_boundary_after = false;
|
|
}
|
|
|
|
if (word_boundary_before && word_boundary_after)
|
|
{
|
|
found = true;
|
|
break;
|
|
}
|
|
|
|
mbui_init (string_iter, tsub_in_string);
|
|
if (!mbui_avail (string_iter))
|
|
break;
|
|
string = tsub_in_string + mb_len (mbui_cur (string_iter));
|
|
}
|
|
else
|
|
{
|
|
bool word_boundary_before;
|
|
const char *p;
|
|
bool word_boundary_after;
|
|
|
|
word_boundary_before = true;
|
|
if (string < tsub_in_string)
|
|
if (isalnum ((unsigned char) tsub_in_string[-1]))
|
|
word_boundary_before = false;
|
|
|
|
p = tsub_in_string + strlen (tsub);
|
|
word_boundary_after = true;
|
|
if (*p != '\0')
|
|
if (isalnum ((unsigned char) *p))
|
|
word_boundary_after = false;
|
|
|
|
if (word_boundary_before && word_boundary_after)
|
|
{
|
|
found = true;
|
|
break;
|
|
}
|
|
|
|
if (*tsub_in_string == '\0')
|
|
break;
|
|
string = tsub_in_string + 1;
|
|
}
|
|
}
|
|
}
|
|
free (tsub);
|
|
return found;
|
|
}
|
|
|
|
/* Return the localization of NAME. NAME is written in ASCII. */
|
|
|
|
const char *
|
|
proper_name (const char *name)
|
|
{
|
|
/* See whether there is a translation. */
|
|
const char *translation = gettext (name);
|
|
|
|
if (translation != name)
|
|
{
|
|
/* See whether the translation contains the original name. */
|
|
if (mbsstr_trimmed_wordbounded (translation, name))
|
|
return translation;
|
|
else
|
|
{
|
|
/* Return "TRANSLATION (NAME)". */
|
|
char *result =
|
|
XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
|
|
|
|
sprintf (result, "%s (%s)", translation, name);
|
|
return result;
|
|
}
|
|
}
|
|
else
|
|
return name;
|
|
}
|
|
|
|
/* Return the localization of a name whose original writing is not ASCII.
|
|
NAME_UTF8 is the real name, written in UTF-8 with octal or hexadecimal
|
|
escape sequences. NAME_ASCII is a fallback written only with ASCII
|
|
characters. */
|
|
|
|
const char *
|
|
proper_name_utf8 (const char *name_ascii, const char *name_utf8)
|
|
{
|
|
/* See whether there is a translation. */
|
|
const char *translation = gettext (name_ascii);
|
|
|
|
/* Try to convert NAME_UTF8 to the locale encoding. */
|
|
const char *locale_code = locale_charset ();
|
|
char *alloc_name_converted = NULL;
|
|
char *alloc_name_converted_translit = NULL;
|
|
const char *name_converted = NULL;
|
|
const char *name_converted_translit = NULL;
|
|
const char *name;
|
|
|
|
if (c_strcasecmp (locale_code, "UTF-8") != 0)
|
|
{
|
|
#if HAVE_ICONV
|
|
name_converted = alloc_name_converted =
|
|
xstr_iconv (name_utf8, "UTF-8", locale_code);
|
|
|
|
# if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
|
|
&& !defined __UCLIBC__) \
|
|
|| _LIBICONV_VERSION >= 0x0105
|
|
{
|
|
char *converted_translit;
|
|
|
|
size_t len = strlen (locale_code);
|
|
char *locale_code_translit = XNMALLOC (len + 10 + 1, char);
|
|
memcpy (locale_code_translit, locale_code, len);
|
|
memcpy (locale_code_translit + len, "//TRANSLIT", 10 + 1);
|
|
|
|
converted_translit =
|
|
xstr_iconv (name_utf8, "UTF-8", locale_code_translit);
|
|
|
|
free (locale_code_translit);
|
|
|
|
if (converted_translit != NULL)
|
|
{
|
|
# if !_LIBICONV_VERSION
|
|
/* Don't use the transliteration if it added question marks.
|
|
glibc's transliteration falls back to question marks; libiconv's
|
|
transliteration does not.
|
|
mbschr is equivalent to strchr in this case. */
|
|
if (strchr (converted_translit, '?') != NULL)
|
|
free (converted_translit);
|
|
else
|
|
# endif
|
|
name_converted_translit = alloc_name_converted_translit =
|
|
converted_translit;
|
|
}
|
|
}
|
|
# endif
|
|
#endif
|
|
}
|
|
else
|
|
{
|
|
name_converted = name_utf8;
|
|
name_converted_translit = name_utf8;
|
|
}
|
|
|
|
/* The name in locale encoding. */
|
|
name = (name_converted != NULL ? name_converted :
|
|
name_converted_translit != NULL ? name_converted_translit :
|
|
name_ascii);
|
|
|
|
/* See whether we have a translation. Some translators have not understood
|
|
that they should use the UTF-8 form of the name, if possible. So if the
|
|
translator provided a no-op translation, we ignore it. */
|
|
if (strcmp (translation, name_ascii) != 0)
|
|
{
|
|
/* See whether the translation contains the original name. */
|
|
if (mbsstr_trimmed_wordbounded (translation, name_ascii)
|
|
|| (name_converted != NULL
|
|
&& mbsstr_trimmed_wordbounded (translation, name_converted))
|
|
|| (name_converted_translit != NULL
|
|
&& mbsstr_trimmed_wordbounded (translation, name_converted_translit)))
|
|
{
|
|
if (alloc_name_converted != NULL)
|
|
free (alloc_name_converted);
|
|
if (alloc_name_converted_translit != NULL)
|
|
free (alloc_name_converted_translit);
|
|
return translation;
|
|
}
|
|
else
|
|
{
|
|
/* Return "TRANSLATION (NAME)". */
|
|
char *result =
|
|
XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
|
|
|
|
sprintf (result, "%s (%s)", translation, name);
|
|
|
|
if (alloc_name_converted != NULL)
|
|
free (alloc_name_converted);
|
|
if (alloc_name_converted_translit != NULL)
|
|
free (alloc_name_converted_translit);
|
|
return result;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (alloc_name_converted != NULL && alloc_name_converted != name)
|
|
free (alloc_name_converted);
|
|
if (alloc_name_converted_translit != NULL
|
|
&& alloc_name_converted_translit != name)
|
|
free (alloc_name_converted_translit);
|
|
return name;
|
|
}
|
|
}
|
|
|
|
#ifdef TEST1
# include <locale.h>
/* Manual test driver for mbsstr_trimmed_wordbounded, built only when TEST1
   is defined.  Usage: PROGRAM STRING SUB
   Prints "found" when STRING contains trim (SUB) at word boundaries.
   Returns EXIT_FAILURE when fewer than two arguments are given, instead of
   passing a null pointer to the search.  */
int
main (int argc, char *argv[])
{
  setlocale (LC_ALL, "");
  if (argc < 3)
    {
      fprintf (stderr, "Usage: %s STRING SUB\n",
               argc > 0 && argv[0] != NULL ? argv[0] : "propername");
      return EXIT_FAILURE;
    }
  if (mbsstr_trimmed_wordbounded (argv[1], argv[2]))
    printf ("found\n");
  return 0;
}
#endif
|
|
|
|
#ifdef TEST2
# include <locale.h>
# include <stdio.h>
/* Manual test driver for proper_name_utf8, built only when TEST2 is
   defined.  Prints the localized form of a fixed sample name, using the
   locale taken from the environment.  The command-line arguments are not
   used.  */
int
main (int argc, char *argv[])
{
  const char *localized;

  setlocale (LC_ALL, "");
  localized =
    proper_name_utf8 ("Franc,ois Pinard", "Fran\303\247ois Pinard");
  printf ("%s\n", localized);
  return 0;
}
#endif
|