rm_/lib/propername.c

/* Localization of proper names.
   Copyright (C) 2006-2020 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2006.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

/* Without this pragma, gcc 4.7.0 20111124 mistakenly suggests that
   the proper_name function might be candidate for attribute 'const'  */
#if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) || 4 < __GNUC__
# pragma GCC diagnostic ignored "-Wsuggest-attribute=const"
#endif

#include <config.h>

/* Specification.  */
#include "propername.h"

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if HAVE_ICONV
# include <iconv.h>
#endif

#include "trim.h"
#include "mbchar.h"
#include "mbuiter.h"
#include "localcharset.h"
#include "c-strcase.h"
#include "xstriconv.h"
#include "xalloc.h"
#include "gettext.h"


/* Multibyte helper.  MATCH is an occurrence of TSUB inside STRING, as
   reported by mbsstr.  Returns true if the multibyte character just before
   MATCH (if any) and the one just after the occurrence (if any) are both
   non-alphanumeric according to mb_isalnum.  Aborts if STRING runs out of
   characters while being walked, which would contradict MATCH being a
   genuine occurrence.  */
static bool
match_is_wordbounded_mb (const char *string, const char *match,
                         const char *tsub)
{
  mbui_iterator_t walker;
  bool boundary_before = true;
  bool boundary_after = true;

  /* Walk from STRING up to MATCH, remembering the last character passed.  */
  mbui_init (walker, string);
  if (mbui_cur_ptr (walker) < match)
    {
      mbchar_t previous;

      do
        {
          if (!mbui_avail (walker))
            abort ();
          previous = mbui_cur (walker);
          mbui_advance (walker);
        }
      while (mbui_cur_ptr (walker) < match);

      boundary_before = mb_isalnum (previous) ? false : true;
    }

  /* Starting at MATCH, step over as many characters as TSUB contains.  */
  mbui_init (walker, match);
  {
    mbui_iterator_t sub_walker;

    mbui_init (sub_walker, tsub);
    while (mbui_avail (sub_walker))
      {
        if (!mbui_avail (walker))
          abort ();
        mbui_advance (walker);
        mbui_advance (sub_walker);
      }
  }

  /* Whatever follows the occurrence must not be alphanumeric.  */
  if (mbui_avail (walker))
    {
      mbchar_t following = mbui_cur (walker);

      boundary_after = mb_isalnum (following) ? false : true;
    }

  return boundary_before && boundary_after;
}

/* Single-byte helper.  Same test as above, but on bytes: the byte before
   MATCH (when MATCH is not at the start of STRING) and the byte right after
   the occurrence (when it is not the terminating NUL) must both be
   non-alphanumeric according to isalnum.  */
static bool
match_is_wordbounded_sb (const char *string, const char *match,
                         const char *tsub)
{
  bool alnum_before =
    string < match && isalnum ((unsigned char) match[-1]);
  const char *past_match = match + strlen (tsub);
  bool alnum_after =
    *past_match != '\0' && isalnum ((unsigned char) *past_match);

  return !alnum_before && !alnum_after;
}

/* Tests whether STRING contains trim (SUB), starting and ending at word
   boundaries.
   Unicode Standard Annex #29 is deliberately not implemented here.  Instead
   it is assumed that trim (SUB) itself begins and ends with a word, so it is
   enough to check that the text in front of an occurrence ends with a
   non-word character and the text behind it starts with one.
   Every occurrence reported by mbsstr is examined in turn; after a rejected
   occurrence the search resumes one character (multibyte locales) or one
   byte (otherwise) past its start.  The string returned by trim is freed
   before returning.  */
static bool
mbsstr_trimmed_wordbounded (const char *string, const char *sub)
{
  char *tsub = trim (sub);
  bool found = false;

  while (*string != '\0')
    {
      const char *match = mbsstr (string, tsub);

      if (match == NULL)
        break;

      if (MB_CUR_MAX > 1)
        {
          mbui_iterator_t head;

          if (match_is_wordbounded_mb (string, match, tsub))
            {
              found = true;
              break;
            }

          /* Rejected: resume right after the first character of MATCH.  */
          mbui_init (head, match);
          if (!mbui_avail (head))
            break;
          string = match + mb_len (mbui_cur (head));
        }
      else
        {
          if (match_is_wordbounded_sb (string, match, tsub))
            {
              found = true;
              break;
            }

          /* Rejected: resume one byte further, unless at end of string.  */
          if (*match == '\0')
            break;
          string = match + 1;
        }
    }

  free (tsub);
  return found;
}

/* Return the localization of NAME.  NAME is written in ASCII.
   The result is NAME itself, the string obtained from gettext, or a newly
   allocated string of the form "TRANSLATION (NAME)"; the latter is never
   freed by this function.  */

const char *
proper_name (const char *name)
{
  const char *translation = gettext (name);
  char *combined;

  /* A result that is pointer-identical to NAME is taken to mean that
     there is no translation.  */
  if (translation == name)
    return name;

  /* A translation that already mentions the original name is used as is.  */
  if (mbsstr_trimmed_wordbounded (translation, name))
    return translation;

  /* Otherwise build "TRANSLATION (NAME)": two bytes for " (", one for ")"
     and one for the terminating NUL.  */
  combined =
    XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
  sprintf (combined, "%s (%s)", translation, name);
  return combined;
}

/* Return the localization of a name whose original writing is not ASCII.
   NAME_UTF8 is the real name, written in UTF-8 with octal or hexadecimal
   escape sequences.  NAME_ASCII is a fallback written only with ASCII
   characters.
   The result is one of: the string obtained from gettext, NAME_ASCII,
   NAME_UTF8, a converted copy of NAME_UTF8, or a newly allocated
   "TRANSLATION (NAME)" string.  Converted copies that are not returned are
   freed before returning.  */

const char *
proper_name_utf8 (const char *name_ascii, const char *name_utf8)
{
  /* Look up a translation, keyed on the ASCII spelling.  */
  const char *translation = gettext (name_ascii);

  /* Encoding that NAME_UTF8 should be converted to.  */
  const char *locale_code = locale_charset ();

  /* Conversion results; the owned_* pointers record what must be freed.  */
  char *owned_exact = NULL;
  char *owned_translit = NULL;
  const char *exact = NULL;
  const char *translit = NULL;

  const char *name;
  const char *answer;

  if (c_strcasecmp (locale_code, "UTF-8") == 0)
    {
      /* The locale already uses UTF-8: no conversion is needed.  */
      exact = name_utf8;
      translit = name_utf8;
    }
  else
    {
#if HAVE_ICONV
      /* Plain conversion; the result may be NULL.  */
      exact = owned_exact = xstr_iconv (name_utf8, "UTF-8", locale_code);

# if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
      && !defined __UCLIBC__) \
     || _LIBICONV_VERSION >= 0x0105
      {
        /* Second attempt, this time with transliteration requested by
           appending a suffix to the target encoding name.  */
        static const char suffix[] = "//TRANSLIT";
        size_t code_len = strlen (locale_code);
        char *target = XNMALLOC (code_len + sizeof suffix, char);
        char *attempt;

        memcpy (target, locale_code, code_len);
        memcpy (target + code_len, suffix, sizeof suffix);

        attempt = xstr_iconv (name_utf8, "UTF-8", target);

        free (target);

        if (attempt != NULL)
          {
            bool usable = true;

#  if !_LIBICONV_VERSION
            /* Don't use the transliteration if it added question marks.
               glibc's transliteration falls back to question marks;
               libiconv's transliteration does not.
               mbschr is equivalent to strchr in this case.  */
            if (strchr (attempt, '?') != NULL)
              usable = false;
#  endif
            if (usable)
              translit = owned_translit = attempt;
            else
              free (attempt);
          }
      }
# endif
#endif
    }

  /* The name in locale encoding: exact conversion first, then the
     transliteration, then the ASCII fallback.  */
  if (exact != NULL)
    name = exact;
  else if (translit != NULL)
    name = translit;
  else
    name = name_ascii;

  /* Some translators have not understood that they should use the UTF-8
     form of the name, if possible.  So a no-op translation is ignored and
     NAME is returned; only the conversion result that is not being returned
     gets freed.  */
  if (strcmp (translation, name_ascii) == 0)
    {
      if (owned_exact != name)
        free (owned_exact);
      if (owned_translit != name)
        free (owned_translit);
      return name;
    }

  /* A real translation exists.  See whether it contains the original name
     in any of its available spellings.  */
  if (mbsstr_trimmed_wordbounded (translation, name_ascii)
      || (exact != NULL
          && mbsstr_trimmed_wordbounded (translation, exact))
      || (translit != NULL
          && mbsstr_trimmed_wordbounded (translation, translit)))
    answer = translation;
  else
    {
      /* Return "TRANSLATION (NAME)".  */
      char *combined =
        XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);

      sprintf (combined, "%s (%s)", translation, name);
      answer = combined;
    }

  /* Neither conversion result is returned on this path; free (NULL) is a
     no-op, so no guards are needed.  */
  free (owned_exact);
  free (owned_translit);
  return answer;
}

#ifdef TEST1
# include <locale.h>
/* Manual test driver for mbsstr_trimmed_wordbounded.
   Usage: PROGRAM STRING SUB
   Prints "found" when the function reports a match for argv[1] and argv[2]
   under the locale taken from the environment.  Returns 1 with a usage
   message when fewer than two arguments are supplied, 0 otherwise.  */
int
main (int argc, char *argv[])
{
  /* Without both arguments, argv[1] or argv[2] would be NULL or out of
     range; refuse to continue instead of dereferencing them.  */
  if (argc < 3)
    {
      fprintf (stderr, "Usage: %s STRING SUB\n",
               argc > 0 && argv[0] != NULL ? argv[0] : "propername");
      return 1;
    }

  setlocale (LC_ALL, "");
  if (mbsstr_trimmed_wordbounded (argv[1], argv[2]))
    printf("found\n");
  return 0;
}
#endif

#ifdef TEST2
# include <locale.h>
# include <stdio.h>
/* Manual test driver for proper_name_utf8: prints the localized form of a
   fixed sample name under the locale taken from the environment.  */
int
main (int argc, char *argv[])
{
  const char *localized;

  setlocale (LC_ALL, "");
  localized =
    proper_name_utf8 ("Franc,ois Pinard", "Fran\303\247ois Pinard");
  printf ("%s\n", localized);
  return 0;
}
#endif
based on coreutils-8.32 2020-09-02 16:47:03 +08:00			`/* Localization of proper names.`
			`Copyright (C) 2006-2020 Free Software Foundation, Inc.`
			`Written by Bruno Haible <bruno@clisp.org>, 2006.`

			`This program is free software: you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation; either version 3 of the License, or`
			`(at your option) any later version.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with this program. If not, see <https://www.gnu.org/licenses/>. */`

			`/* Without this pragma, gcc 4.7.0 20111124 mistakenly suggests that`
			`the proper_name function might be candidate for attribute 'const' */`
			`#if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) \|\| 4 < __GNUC__`
			`# pragma GCC diagnostic ignored "-Wsuggest-attribute=const"`
			`#endif`

			`#include <config.h>`

			`/* Specification. */`
			`#include "propername.h"`

			`#include <ctype.h>`
			`#include <stdbool.h>`
			`#include <stdio.h>`
			`#include <stdlib.h>`
			`#include <string.h>`
			`#if HAVE_ICONV`
			`# include <iconv.h>`
			`#endif`

			`#include "trim.h"`
			`#include "mbchar.h"`
			`#include "mbuiter.h"`
			`#include "localcharset.h"`
			`#include "c-strcase.h"`
			`#include "xstriconv.h"`
			`#include "xalloc.h"`
			`#include "gettext.h"`


			`/* Tests whether STRING contains trim (SUB), starting and ending at word`
			`boundaries.`
			`Here, instead of implementing Unicode Standard Annex #29 for determining`
			`word boundaries, we assume that trim (SUB) starts and ends with words and`
			`only test whether the part before it ends with a non-word and the part`
			`after it starts with a non-word. */`
			`static bool`
			`mbsstr_trimmed_wordbounded (const char *string, const char *sub)`
			`{`
			`char *tsub = trim (sub);`
			`bool found = false;`

			`for (; *string != '\0';)`
			`{`
			`const char *tsub_in_string = mbsstr (string, tsub);`
			`if (tsub_in_string == NULL)`
			`break;`
			`else`
			`{`
			`if (MB_CUR_MAX > 1)`
			`{`
			`mbui_iterator_t string_iter;`
			`bool word_boundary_before;`
			`bool word_boundary_after;`

			`mbui_init (string_iter, string);`
			`word_boundary_before = true;`
			`if (mbui_cur_ptr (string_iter) < tsub_in_string)`
			`{`
			`mbchar_t last_char_before_tsub;`
			`do`
			`{`
			`if (!mbui_avail (string_iter))`
			`abort ();`
			`last_char_before_tsub = mbui_cur (string_iter);`
			`mbui_advance (string_iter);`
			`}`
			`while (mbui_cur_ptr (string_iter) < tsub_in_string);`
			`if (mb_isalnum (last_char_before_tsub))`
			`word_boundary_before = false;`
			`}`

			`mbui_init (string_iter, tsub_in_string);`
			`{`
			`mbui_iterator_t tsub_iter;`

			`for (mbui_init (tsub_iter, tsub);`
			`mbui_avail (tsub_iter);`
			`mbui_advance (tsub_iter))`
			`{`
			`if (!mbui_avail (string_iter))`
			`abort ();`
			`mbui_advance (string_iter);`
			`}`
			`}`
			`word_boundary_after = true;`
			`if (mbui_avail (string_iter))`
			`{`
			`mbchar_t first_char_after_tsub = mbui_cur (string_iter);`
			`if (mb_isalnum (first_char_after_tsub))`
			`word_boundary_after = false;`
			`}`

			`if (word_boundary_before && word_boundary_after)`
			`{`
			`found = true;`
			`break;`
			`}`

			`mbui_init (string_iter, tsub_in_string);`
			`if (!mbui_avail (string_iter))`
			`break;`
			`string = tsub_in_string + mb_len (mbui_cur (string_iter));`
			`}`
			`else`
			`{`
			`bool word_boundary_before;`
			`const char *p;`
			`bool word_boundary_after;`

			`word_boundary_before = true;`
			`if (string < tsub_in_string)`
			`if (isalnum ((unsigned char) tsub_in_string[-1]))`
			`word_boundary_before = false;`

			`p = tsub_in_string + strlen (tsub);`
			`word_boundary_after = true;`
			`if (*p != '\0')`
			`if (isalnum ((unsigned char) *p))`
			`word_boundary_after = false;`

			`if (word_boundary_before && word_boundary_after)`
			`{`
			`found = true;`
			`break;`
			`}`

			`if (*tsub_in_string == '\0')`
			`break;`
			`string = tsub_in_string + 1;`
			`}`
			`}`
			`}`
			`free (tsub);`
			`return found;`
			`}`

			`/* Return the localization of NAME. NAME is written in ASCII. */`

			`const char *`
			`proper_name (const char *name)`
			`{`
			`/* See whether there is a translation. */`
			`const char *translation = gettext (name);`

			`if (translation != name)`
			`{`
			`/* See whether the translation contains the original name. */`
			`if (mbsstr_trimmed_wordbounded (translation, name))`
			`return translation;`
			`else`
			`{`
			`/* Return "TRANSLATION (NAME)". */`
			`char *result =`
			`XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);`

			`sprintf (result, "%s (%s)", translation, name);`
			`return result;`
			`}`
			`}`
			`else`
			`return name;`
			`}`

			`/* Return the localization of a name whose original writing is not ASCII.`
			`NAME_UTF8 is the real name, written in UTF-8 with octal or hexadecimal`
			`escape sequences. NAME_ASCII is a fallback written only with ASCII`
			`characters. */`

			`const char *`
			`proper_name_utf8 (const char *name_ascii, const char *name_utf8)`
			`{`
			`/* See whether there is a translation. */`
			`const char *translation = gettext (name_ascii);`

			`/* Try to convert NAME_UTF8 to the locale encoding. */`
			`const char *locale_code = locale_charset ();`
			`char *alloc_name_converted = NULL;`
			`char *alloc_name_converted_translit = NULL;`
			`const char *name_converted = NULL;`
			`const char *name_converted_translit = NULL;`
			`const char *name;`

			`if (c_strcasecmp (locale_code, "UTF-8") != 0)`
			`{`
			`#if HAVE_ICONV`
			`name_converted = alloc_name_converted =`
			`xstr_iconv (name_utf8, "UTF-8", locale_code);`

			`# if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) \|\| __GLIBC__ > 2) \`
			`&& !defined __UCLIBC__) \`
			`\|\| _LIBICONV_VERSION >= 0x0105`
			`{`
			`char *converted_translit;`

			`size_t len = strlen (locale_code);`
			`char *locale_code_translit = XNMALLOC (len + 10 + 1, char);`
			`memcpy (locale_code_translit, locale_code, len);`
			`memcpy (locale_code_translit + len, "//TRANSLIT", 10 + 1);`

			`converted_translit =`
			`xstr_iconv (name_utf8, "UTF-8", locale_code_translit);`

			`free (locale_code_translit);`

			`if (converted_translit != NULL)`
			`{`
			`# if !_LIBICONV_VERSION`
			`/* Don't use the transliteration if it added question marks.`
			`glibc's transliteration falls back to question marks; libiconv's`
			`transliteration does not.`
			`mbschr is equivalent to strchr in this case. */`
			`if (strchr (converted_translit, '?') != NULL)`
			`free (converted_translit);`
			`else`
			`# endif`
			`name_converted_translit = alloc_name_converted_translit =`
			`converted_translit;`
			`}`
			`}`
			`# endif`
			`#endif`
			`}`
			`else`
			`{`
			`name_converted = name_utf8;`
			`name_converted_translit = name_utf8;`
			`}`

			`/* The name in locale encoding. */`
			`name = (name_converted != NULL ? name_converted :`
			`name_converted_translit != NULL ? name_converted_translit :`
			`name_ascii);`

			`/* See whether we have a translation. Some translators have not understood`
			`that they should use the UTF-8 form of the name, if possible. So if the`
			`translator provided a no-op translation, we ignore it. */`
			`if (strcmp (translation, name_ascii) != 0)`
			`{`
			`/* See whether the translation contains the original name. */`
			`if (mbsstr_trimmed_wordbounded (translation, name_ascii)`
			`\|\| (name_converted != NULL`
			`&& mbsstr_trimmed_wordbounded (translation, name_converted))`
			`\|\| (name_converted_translit != NULL`
			`&& mbsstr_trimmed_wordbounded (translation, name_converted_translit)))`
			`{`
			`if (alloc_name_converted != NULL)`
			`free (alloc_name_converted);`
			`if (alloc_name_converted_translit != NULL)`
			`free (alloc_name_converted_translit);`
			`return translation;`
			`}`
			`else`
			`{`
			`/* Return "TRANSLATION (NAME)". */`
			`char *result =`
			`XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);`

			`sprintf (result, "%s (%s)", translation, name);`

			`if (alloc_name_converted != NULL)`
			`free (alloc_name_converted);`
			`if (alloc_name_converted_translit != NULL)`
			`free (alloc_name_converted_translit);`
			`return result;`
			`}`
			`}`
			`else`
			`{`
			`if (alloc_name_converted != NULL && alloc_name_converted != name)`
			`free (alloc_name_converted);`
			`if (alloc_name_converted_translit != NULL`
			`&& alloc_name_converted_translit != name)`
			`free (alloc_name_converted_translit);`
			`return name;`
			`}`
			`}`

			`#ifdef TEST1`
			`# include <locale.h>`
			`int`
			`main (int argc, char *argv[])`
			`{`
			`setlocale (LC_ALL, "");`
			`if (mbsstr_trimmed_wordbounded (argv[1], argv[2]))`
			`printf("found\n");`
			`return 0;`
			`}`
			`#endif`

			`#ifdef TEST2`
			`# include <locale.h>`
			`# include <stdio.h>`
			`int`
			`main (int argc, char *argv[])`
			`{`
			`setlocale (LC_ALL, "");`
			`printf ("%s\n", proper_name_utf8 ("Franc,ois Pinard", "Fran\303\247ois Pinard"));`
			`return 0;`
			`}`
			`#endif`