390 lines
11 KiB
Groff
390 lines
11 KiB
Groff
'''
|
||
''' Copyright (C) 2000, 2001, 2002 Loic Dachary <loic@senga.org>
|
||
'''
|
||
''' This program is free software; you can redistribute it and/or modify
|
||
''' it under the terms of the GNU General Public License as published by
|
||
''' the Free Software Foundation; either version 2 of the License, or
|
||
''' (at your option) any later version.
|
||
'''
|
||
''' This program is distributed in the hope that it will be useful,
|
||
''' but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
''' MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
''' GNU General Public License for more details.
|
||
'''
|
||
''' You should have received a copy of the GNU General Public License
|
||
''' along with this program; if not, write to the Free Software
|
||
''' Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||
'''
|
||
.TH unac 3 local
|
||
.SH NAME
|
||
unac \- remove accents from string or character
|
||
|
||
.SH SYNOPSIS
|
||
.nf
|
||
.ft CW
|
||
#include <unac.h>
|
||
|
||
const char* unac_version();
|
||
|
||
int unac_string(const char* charset,
|
||
const char* in, int in_length,
|
||
char** out, int* out_length);
|
||
|
||
int unac_string_utf16(const char* in, int in_length,
|
||
char** out, int* out_length);
|
||
|
||
/* MACRO: side effect on unaccented and length */
|
||
unac_char_utf16(unsigned short c,
|
||
unsigned short* unaccented,
|
||
int length);
|
||
|
||
const char* unac_version()
|
||
|
||
/*
|
||
* level: UNAC_DEBUG_NONE UNAC_DEBUG_LOW UNAC_DEBUG_HIGH
|
||
*/
|
||
void unac_debug(int level)
|
||
|
||
/* typedef void (*unac_debug_print_t)(const char* message, void* data); */
|
||
void unac_debug_callback(int level, unac_debug_print_t function, void* data)
|
||
|
||
.ft R
|
||
.fi
|
||
|
||
.SH DESCRIPTION
|
||
.I unac
|
||
is a C library that removes accents from characters, regardless of the
|
||
character set (ISO-8859-15, ISO-CELTIC, KOI8-RU...) as long as
|
||
.I iconv(3)
|
||
is able to convert it into UTF-16 (Unicode).
|
||
|
||
The
|
||
.B unac_string
|
||
function is given a charset (ISO-8859-15 for instance) and a string. It
|
||
converts the string into UTF-16 and calls the
|
||
.B unac_string_utf16
|
||
function to remove all accents from the UTF-16 version. The unaccented
|
||
string is then converted into the original charset (ISO-8859-15 for
|
||
instance) and sent back to the caller of
|
||
.B unac_string.
|
||
|
||
.I unac
|
||
does a little more than removing accents:
|
||
every character that is made of two characters, such as
|
||
.B \(ae
|
||
(ISO-8859-15 octal code 346)
|
||
will be expanded into two characters
|
||
.B a
|
||
and
|
||
.B e.
|
||
Should a character be made of three characters, it would be
|
||
decomposed in the same way.
|
||
|
||
The conversion from and to UTF-16 is
|
||
done with
|
||
.I iconv(3).
|
||
The
|
||
.B iconv -l
|
||
will list all available charsets. Using UTF-16 as a pivot implies an
|
||
overhead but ensures that accents can be removed from all characters
|
||
for which there is an equivalent character in Unicode.
|
||
|
||
.B unac_char_utf16
|
||
is a macro that returns a pointer to the unaccented equivalent
|
||
of any UTF-16 character. It is the basic building block of
|
||
.I unac.
|
||
|
||
.B unac_string_utf16
|
||
repeatedly applies the
|
||
.B unac_char_utf16
|
||
macro on each character of a UTF-16 string.
|
||
|
||
.SH FUNCTIONS
|
||
|
||
.TP
|
||
.B int unac_string(const char* charset, const char* in, int in_length, char** out, int* out_length)
|
||
|
||
Returns the unaccented equivalent of the string
|
||
.B 'in'
|
||
of length
|
||
.B 'in_length'.
|
||
The returned string is stored in the pointer pointed to by the
|
||
.B 'out'
|
||
argument and the length of the string is stored in the integer pointed to by
|
||
the
|
||
.B 'out_length'
|
||
argument.
|
||
If the
|
||
.B '*out'
|
||
pointer is not null, it must point to an area allocated by
|
||
.I malloc(3)
|
||
and the length of the array must be specified in the
|
||
.B '*out_length'
|
||
argument. Both arguments
|
||
.B '*out'
|
||
and
|
||
.B '*out_length'
|
||
will be replaced with the return values when the function returns
|
||
on success. The
|
||
.B '*out'
|
||
pointer may point to a memory location that has been reallocated
|
||
by the
|
||
.B unac_string
|
||
function. There is no guarantee that '*out' is identical to the
|
||
value given by the caller and the malloc'ed memory location given
|
||
by the caller may not be useable when the function returns (either
|
||
error or success).
|
||
If the
|
||
.B '*out'
|
||
pointer is null, the
|
||
.B unac_string
|
||
function allocates a new memory block using
|
||
.I malloc(3).
|
||
It is the responsibility of the caller to deallocate the area returned
|
||
in the
|
||
.B '*out'
|
||
pointer.
|
||
|
||
The return value of
|
||
.B unac_string
|
||
is 0 on success and -1 on error, in which case
|
||
the errno variable is set to the corresponding error code. See
|
||
the ERRORS section below for more information; the
|
||
.I iconv(3)
|
||
manual page may also help.
|
||
|
||
.TP
|
||
.B int unac_string_utf16(const char* in, int in_length, char** out, int* out_length)
|
||
|
||
Has the same effect as
|
||
.B unac_string("UTF-16", in, in_length, out, out_length).
|
||
Since the
|
||
.B unac_string_utf16
|
||
is the backend function of the
|
||
.B unac_string
|
||
function it is more efficient because no charset conversion of the
|
||
input string (from and to UTF-16) is necessary.
|
||
|
||
.TP
|
||
.B unac_char_utf16(const unsigned short c, unsigned short* p, int l)
|
||
|
||
.B Warning: this is a macro, each argument may be evaluated more than once.
|
||
Returns the unaccented equivalent of the UTF-16 character
|
||
.B 'c'
|
||
in the pointer
|
||
.B 'p'.
|
||
The length of the unsigned short array pointed to by
|
||
.B 'p'
|
||
is returned in the
|
||
.B 'l'
|
||
argument.
|
||
|
||
.TP
|
||
.B const char* unac_version()
|
||
|
||
Return the version number of
|
||
.B unac.
|
||
|
||
.TP
|
||
.B void unac_debug(int level)
|
||
Set the debug level of the unac library to
|
||
.B 'level'.
|
||
Possible values are: UNAC_DEBUG_NONE for no debug at all, UNAC_DEBUG_LOW
|
||
for terse human readable information, UNAC_DEBUG_HIGH for very detailed
|
||
information only usable when translating a few strings.
|
||
|
||
unac_debug_callback with anything but UNAC_DEBUG_NONE is not
|
||
thread safe.
|
||
|
||
.TP
|
||
.B void unac_debug_callback(int level, unac_debug_print_t function, void* data)
|
||
|
||
Set the debug level and define a printing function callback.
|
||
The
|
||
.B 'level'
|
||
is the same as in unac_debug. The
|
||
.B 'function'
|
||
is in charge of dealing with the debug messages,
|
||
presumably to print them to the user.
|
||
The
|
||
.B 'data'
|
||
is an opaque pointer that is passed along to
|
||
.BR function ", should it"
|
||
need to manage a persistent context.
|
||
|
||
The prototype of
|
||
.B 'function'
|
||
accepts two arguments. The first
|
||
is the debug message
|
||
.I (const char*),
|
||
the second is the opaque
|
||
pointer given as
|
||
.B 'data'
|
||
argument to
|
||
.B unac_debug_callback.
|
||
|
||
If
|
||
.B 'function'
|
||
is NULL, messages are printed on the standard
|
||
error output using fprintf(stderr...).
|
||
|
||
unac_debug_callback with anything but UNAC_DEBUG_NONE is not
|
||
thread safe.
|
||
.B
|
||
.SH ERRORS
|
||
|
||
.TP
|
||
EINVAL
|
||
|
||
the requested conversion pair is not available. For instance, when
|
||
specifying the ISO-0000 charset (imaginary), it means it is not possible to
|
||
convert from ISO-0000 to UTF-16.
|
||
|
||
.SH EXAMPLES
|
||
Convert the
|
||
.B \('et\('e
|
||
string into
|
||
.B ete.
|
||
.nf
|
||
.ft CW
|
||
#include <unac.h>
|
||
|
||
char* out = 0;
|
||
int out_length = 0;
|
||
if(unac_string("ISO-8859-1", "\('et\('e", strlen("\('et\('e"), &out, &out_length)) {
|
||
perror("unac_string");
|
||
} else {
|
||
printf("%.*s\en", out_length, out);
|
||
free(out);
|
||
}
|
||
.ft R
|
||
.fi
|
||
|
||
.SH IMPLEMENTATION NOTES
|
||
|
||
The endianness of the UTF-16 strings manipulated by
|
||
.I unac
|
||
must always be big endian. When using
|
||
.I iconv(3)
|
||
to translate strings, UTF-16BE should be used instead of UTF-16 to make sure
|
||
it is big endian (BE). On some systems where UTF-16BE is not available,
|
||
.B unac
|
||
will rely on UTF-16 and hope it is properly big endian encoded.
|
||
For more information check RFC2781 (http://www.faqs.org/rfcs/rfc2781.html: UTF-16,
|
||
an encoding of ISO 10646).
|
||
|
||
The
|
||
.I unac
|
||
library uses the Unicode database to map accented letters to their unaccented
|
||
equivalent. Mapping tables are generated from the
|
||
.I UnicodeData-3.2.0.txt
|
||
file (as found at http://www.unicode.org/Public/3.2-Update/) by the
|
||
.I builder
|
||
perl script. The
|
||
.I builder
|
||
script inserts these tables in the
|
||
.I unac.h
|
||
and
|
||
.I unac.c
|
||
files, replacing the existing ones. Looking for the
|
||
.B 'Generated by builder'
|
||
string in the
|
||
.I unac.[ch]
|
||
files makes it possible to spot the various parts handled by the builder script.
|
||
|
||
The library data occupies 25KB where a simple minded table would
|
||
occupy around 512Kbytes. The idea used to compress the tables is that
|
||
many Unicode characters do not have an unaccented equivalent. Instead of
|
||
relying on a table mapping each Unicode character to the corresponding
|
||
unaccented character, an intermediate array of pointers is created. In
|
||
the drawing below, the range of UTF-16 characters is not accurate but
|
||
illustrates the method. The
|
||
.B unac_data_table
|
||
points to a set of
|
||
.B unac_dataXX
|
||
arrays. Each pointer covers a range of UTF-16 characters (4 in the example
|
||
below). When a range of characters does not contain any accented character,
|
||
unac_data_table always points to the same array: unac_data0. Since there
|
||
are many characters without accents, this is enough to achieve a good
|
||
compression.
|
||
.nf
|
||
.ft CW
|
||
|
||
|
||
unac_data15 unac_data16
|
||
[ NULL, NULL, NULL, e ] <----\ /------> [ a, NULL, NULL, NULL ]
|
||
| |
|
||
| |
|
||
^ ^
|
||
|-----| |-----| |-----| |-----| |-----| |-----|
|
||
[ ... a b c d e f g h i j k \('e \(`a 0 1 2 3 4 5 6 7 8 9 A... ] unac_data_table
|
||
|-----| |-----| |-----| |-----| |-----| |-----|
|
||
v v v v
|
||
| | | |
|
||
| | | |
|
||
\--------------------------------------/
|
||
|
|
||
V
|
||
[ NULL, NULL, NULL, NULL ]
|
||
unac_data0
|
||
|
||
|
||
.ft R
|
||
.fi
|
||
Besides this simple optimization, a table (unac_positions) listing the
|
||
actual position of the unaccented replacement within a block
|
||
(unac_dataXX) is necessary because they are not of fixed length. Some
|
||
characters such as
|
||
.B \(ae
|
||
will
|
||
be replaced by two characters
|
||
.B a
|
||
and
|
||
.B e
|
||
therefore unac_dataXX has a variable size.
|
||
|
||
The unaccented equivalent of a UTF-16 character is calculated by
|
||
applying a
|
||
.I compatibility decomposition
|
||
and then stripping all characters that belong to the
|
||
.I mark
|
||
category. For a precise definition see the Unicode-3.2 normalization
|
||
forms at http://www.unicode.org/unicode/reports/tr28/.
|
||
|
||
All original Unicode data files were taken from
|
||
http://www.unicode.org/Public and prepended with the
|
||
.B UCD Terms of Use.
|
||
|
||
.SH BUGS
|
||
The input string must not contain partially formed characters; there is
|
||
no support for this case.
|
||
|
||
UTF-16 surrogates are not handled.
|
||
|
||
Unicode may contain bugs in the decomposition of characters. When
|
||
you suspect such a bug on a given string, add a test case with the
|
||
faulty string in the
|
||
.B t_unac.in
|
||
test script in the source distribution and run
|
||
make check. It will describe, in a very verbose way, how the string was
|
||
unaccented. You may then fix the UnicodeData-3.2.0.txt file and
|
||
run make check again to make sure the problem is solved. Please
|
||
send such fixes to the author and to the Unicode consortium.
|
||
|
||
.SH SEE ALSO
|
||
unaccent(1), iconv(3)
|
||
.nf
|
||
.ft CW
|
||
http://www.unicode.org/
|
||
http://oss.software.ibm.com/icu/
|
||
http://www.gnu.org/manual/glibc-2.2.5/libc.html
|
||
.ft R
|
||
.fi
|
||
|
||
.SH AUTHOR
|
||
Loic Dachary loic@senga.org
|
||
.nf
|
||
.ft CW
|
||
http://www.senga.org/unac/
|
||
.ft R
|
||
.fi
|