527 lines
16 KiB
Text
527 lines
16 KiB
Text
#
|
|
# Copyright (C) 2000, 2001, 2002 Loic Dachary <loic@senga.org>
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program; if not, write to the Free Software
|
|
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
#
|
|
#
|
|
# Generate a table mapping UTF-16 characters to their unaccented
|
|
# equivalent. Some characters such as fi (one character) are expanded
|
|
# into two letters : f and i. In Unicode jargon it means that the table
|
|
# map each character to its compatibility decomposition in which marks
|
|
# were stripped.
|
|
#
|
|
# The format of the $base file can be found at:
|
|
# http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html
|
|
#
|
|
use strict;
|
|
|
|
use Getopt::Long;
|
|
|
|
sub main {
|
|
my($base) = "UnicodeData-@UNICODE_VERSION@.txt";
|
|
my($cfbase) = "CaseFolding-@UNICODE_VERSION@.txt";
|
|
my($verbose);
|
|
my($source);
|
|
my($reference);
|
|
|
|
GetOptions("verbose+" => \$verbose,
|
|
"database=s" => \$base,
|
|
"source!" => \$source,
|
|
"reference!" => \$reference);
|
|
|
|
my(%decomposition, %mark, %name);
|
|
my(%ranges);
|
|
open(FILE, "<$base") or die "cannot open $base for reading : $!";
|
|
while(<FILE>) {
|
|
next if(/^\s*#/); # Skip comments
|
|
my($code_value,
|
|
$character_name,
|
|
$general_category,
|
|
$canonical_combining_classes,
|
|
$bidirectional_category,
|
|
$character_decomposition_mapping,
|
|
$decimal_digit_value,
|
|
$digit_value,
|
|
$numeric_value,
|
|
$mirrored,
|
|
$unicode_1_0_name,
|
|
$_10646_comment_field,
|
|
$uppercase_mapping,
|
|
$lowercase_mapping,
|
|
$titlecase_mapping) = split(/;/, $_);
|
|
|
|
# Basic plane only
|
|
last if (hex $code_value > 0xffff);
|
|
|
|
#
|
|
# Generate ranges of values that are not explicitly listed.
|
|
# CJK ideographs for instance.
|
|
#
|
|
if($character_name =~ /^<(.*), (First|Last)>/) {
|
|
$ranges{$1}{$2} = $code_value;
|
|
}
|
|
|
|
# Test for exceptions to unaccenting. Note that this is
|
|
# mostly based on blocks when it should use the Unicode
|
|
# script property. In practise, for the script concerned,
|
|
# this does not look to be an issue currently
|
|
# (following comment made for japanese but also concerns
|
|
# other exceptions)
|
|
# For kana japanese characters, we don't want to strip
|
|
# accents as I'm told that they are essential and
|
|
# stripping them does not make sense.
|
|
# Problem: the first solution used was to decompose the
|
|
# Japanese accented kana and not remove accents. But then
|
|
# the unaccented character would match the string with
|
|
# accent. So now we don't decompose at all, but this means
|
|
# that, if the original text was decomposed, things don't
|
|
# work as intended as we should actually recombine the
|
|
# letter+accents in this case for data to be unified.
|
|
|
|
# Hiragana + Katakana
|
|
if (!(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff)
|
|
# Halfwidth katakana
|
|
&& !(hex $code_value >= 0xff65 && hex $code_value <= 0xff9f)
|
|
# Hindi Devanagari
|
|
&& !(hex $code_value >= 0x0900 && hex $code_value <= 0x097f)
|
|
&& !(hex $code_value >= 0xa8e0 && hex $code_value <= 0xa8ff)
|
|
# Bengali
|
|
&& !(hex $code_value >= 0x0980 && hex $code_value <= 0x09ff)
|
|
) {
|
|
# If a decomposition exists, record it
|
|
if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {
|
|
$decomposition{$code_value} = $2;
|
|
}
|
|
if($general_category =~ /^M/) {
|
|
$mark{$code_value} = 1;
|
|
# For mark caracters, we generate a 0 entry in the
|
|
# decomposition table. This signals to the c code that no
|
|
# output should be generated. Slightly hacky but ok. The
|
|
# original code left mark character go through (generating
|
|
# still accented output if the input was in decomposed
|
|
# form). Decomposed text is rare, but, for example, macosx file
|
|
# names have separate combining accent characters.
|
|
$decomposition{$code_value} = "0000";
|
|
}
|
|
}
|
|
$name{$code_value} = $character_name;
|
|
}
|
|
close(FILE);
|
|
|
|
# Generate compatibility decomposition and strip marks
|
|
# (marks == diacritics == accents)
|
|
#
|
|
# We also forbid any excursion out of the basic plane.
|
|
my($from, $to);
|
|
while(($from, $to) = each(%decomposition)) {
|
|
my(@code_values) = split(' ', $to);
|
|
my($code_value);
|
|
my(@decomposition);
|
|
|
|
while(@code_values) {
|
|
my($code_value) = shift(@code_values);
|
|
if (hex $code_value > 0xffff) {
|
|
undef @decomposition;
|
|
last;
|
|
}
|
|
# marks also have entries in the decomposition table (so that
|
|
# they can be suppressed when found in input), but no output
|
|
# component should be generated for them.
|
|
if (!exists($mark{$code_value})) {
|
|
if(exists($decomposition{$code_value})) {
|
|
push(@code_values, split(' ', $decomposition{$code_value}));
|
|
} else {
|
|
push(@decomposition, $code_value);
|
|
}
|
|
}
|
|
}
|
|
if(@decomposition) {
|
|
$decomposition{$from} = "@decomposition";
|
|
} else {
|
|
delete($decomposition{$from});
|
|
}
|
|
}
|
|
|
|
# Read in the casefolding file
|
|
my(%casefold);
|
|
open(FILE, "<$cfbase") or die "cannot open $cfbase for reading : $!";
|
|
while(<FILE>) {
|
|
next if(/^\s*#/); # Skip comments
|
|
my($code_value,
|
|
$foldstatus,
|
|
$folded) = split(/;/, $_);
|
|
if ($foldstatus =~ /C|F/) {
|
|
$casefold{$code_value} = $folded;
|
|
}
|
|
}
|
|
close(FILE);
|
|
|
|
#showcasefold(\%casefold);
|
|
reference(\%decomposition, $verbose) if($reference);
|
|
source(\%decomposition, \%name, \%casefold, $verbose) if($source);
|
|
}
|
|
|
|
sub showcasefold {
|
|
my($casefold) = @_;
|
|
|
|
my($code_value);
|
|
foreach $code_value (0 .. 0xFFFF) {
|
|
$code_value = uc(sprintf("%04x", $code_value));
|
|
print "$code_value";
|
|
if(exists($casefold->{$code_value})) {
|
|
print " => $casefold->{$code_value}\n";
|
|
} else {
|
|
print "\n";
|
|
}
|
|
}
|
|
}
|
|
|
|
#
|
|
# Generate machine readable file mapping all UTF-16 codes
|
|
# to their unaccented replacement. This file can be compared
|
|
# with the output of a program doing the same mapping using the
|
|
# libunac library.
|
|
#
|
|
sub reference {
|
|
my($decomposition, $verbose) = @_;
|
|
|
|
my($code_value);
|
|
foreach $code_value (0 .. 0xFFFF) {
|
|
$code_value = uc(sprintf("%04x", $code_value));
|
|
print "$code_value";
|
|
if(exists($decomposition->{$code_value})) {
|
|
print " => $decomposition->{$code_value}\n";
|
|
} else {
|
|
print "\n";
|
|
}
|
|
}
|
|
}
|
|
|
|
#
|
|
# Read input file into hash table and return it.
|
|
#
|
|
# The input is divided in chuncks according to special markers. For
|
|
# instance:
|
|
#
|
|
# before
|
|
# /* Generated by builder. Do not modify. Start a_tag */
|
|
# bla bla
|
|
# /* Generated by builder. Do not modify. End a_tag */
|
|
# after
|
|
# /* Generated by builder. Do not modify. Start b_tag */
|
|
# more stuff
|
|
# /* Generated by builder. Do not modify. End b_tag */
|
|
# still something
|
|
#
|
|
# Will generate the following hash:
|
|
#
|
|
# {
|
|
# 'list' => [ 1, a_tag, 2, b_tag, 3 ],
|
|
# '1' => "before\n",
|
|
# 'a_tag' => undef,
|
|
# '2' => "after\n";
|
|
# 'b_tag' => undef,
|
|
# '3' => "still something\n"
|
|
# }
|
|
#
|
|
# The caller may then assign a string to the a_tag and b_tag entries
|
|
# and then call the spit function to rebuild the file.
|
|
#
|
|
sub slurp {
|
|
my($file) = @_;
|
|
my(%content);
|
|
my($count) = 1;
|
|
my(@lines);
|
|
open(FILE, "<$file") or die "cannot open $file for reading : $!";
|
|
while(<FILE>) {
|
|
if(/Do not modify. Start\s+(\w+)/i) {
|
|
push(@{$content{'list'}}, $count);
|
|
$content{$count} = join("", @lines);
|
|
$count++;
|
|
push(@{$content{'list'}}, $1);
|
|
@lines = ();
|
|
}
|
|
next if(/Do not modify. Start/i .. /Do not modify. End/i);
|
|
push(@lines, $_);
|
|
}
|
|
if(@lines) {
|
|
push(@{$content{'list'}}, $count);
|
|
$content{$count} = join("", @lines);
|
|
}
|
|
close(FILE);
|
|
return \%content;
|
|
}
|
|
|
|
#
|
|
# Write the $file with the content of the $content hash table.
|
|
# See the slurp function for a description of the $content format.
|
|
#
|
|
sub spit {
|
|
my($file, $content) = @_;
|
|
open(FILE, ">$file") or die "cannot open $file for writing : $!";
|
|
my($tag);
|
|
foreach $tag (@{$content->{'list'}}) {
|
|
print(FILE "/* Generated by builder. Do not modify. Start $tag */\n") if($tag !~ /^\d+$/);
|
|
print FILE $content->{$tag};
|
|
print(FILE "/* Generated by builder. Do not modify. End $tag */\n") if($tag !~ /^\d+$/);
|
|
}
|
|
close(FILE);
|
|
}
|
|
|
|
#
|
|
# Generate tables, defines and code in the unac.c and unac.h files.
|
|
# The unac.c and unac.h files are substituted in place.
|
|
#
|
|
sub source {
|
|
my($decomposition, $name, $casefold, $verbose) = @_;
|
|
|
|
my($csource) = slurp("unac.c");
|
|
my($hsource) = slurp("unac.h");
|
|
#
|
|
# Human readable table
|
|
#
|
|
my(@comment);
|
|
push(@comment, "/*\n");
|
|
my($from);
|
|
foreach $from (sort(keys(%$decomposition))) {
|
|
my($character_name) = $name->{$from};
|
|
$character_name = "??" if(!$character_name);
|
|
push(@comment, " * $from $character_name\n");
|
|
my($code_value);
|
|
foreach $code_value (split(' ', $decomposition->{$from})) {
|
|
$character_name = $name->{$code_value} || "??";
|
|
push(@comment, " * \t$code_value $character_name\n");
|
|
}
|
|
}
|
|
push(@comment, "*/\n");
|
|
my($comment) = join("", @comment);
|
|
|
|
#
|
|
# Select the best block size (the one that takes less space)
|
|
#
|
|
# result: $best_blocks (array of blocks that contain exactly
|
|
# $block_count replacements. Each block
|
|
# is a string containing replacements
|
|
# separated by |)
|
|
# $best_indexes (array mapping block number to a block
|
|
# in the $best_blocks array)
|
|
# $best_block_shift (the size of the block)
|
|
#
|
|
# Within a block, if the character has no replacement the 0xFFFF
|
|
# placeholder is inserted.
|
|
#
|
|
my($best_blocks);
|
|
my($best_indexes);
|
|
my($best_block_shift);
|
|
my($best_total_size) = 10 * 1024 * 1024;
|
|
my($block_shift);
|
|
foreach $block_shift (2 .. 10) {
|
|
my($block_count) = 1 << $block_shift;
|
|
my(@blocks, @indexes);
|
|
my($duplicate) = 0;
|
|
my(@values);
|
|
my($code_value);
|
|
foreach $code_value (0 .. 0x10000) {
|
|
if($code_value > 0 && $code_value % $block_count == 0) {
|
|
my($block) = join("|", @values);
|
|
my($existing_block);
|
|
my($index) = 0;
|
|
my($found);
|
|
foreach $existing_block (@blocks) {
|
|
if($block eq $existing_block) {
|
|
push(@indexes, $index);
|
|
$found = 1;
|
|
$duplicate++;
|
|
last;
|
|
}
|
|
$index++;
|
|
}
|
|
if(!$found) {
|
|
push(@indexes, $index);
|
|
push(@blocks, $block);
|
|
}
|
|
@values = ();
|
|
}
|
|
$code_value = uc(sprintf("%04x", $code_value));
|
|
#print "$code_value UNAC ";
|
|
if(exists($decomposition->{$code_value})) {
|
|
push(@values, $decomposition->{$code_value});
|
|
#print "$decomposition->{$code_value} ";
|
|
} else {
|
|
push(@values, "FFFF");
|
|
#print "FFFF ";
|
|
}
|
|
# We push both the case-folded version of the unaccented char
|
|
# and the case-folded version of the original one. This
|
|
# makes the table a little bigger, but allows
|
|
# independantly unaccenting, folding or both
|
|
#print "UNACFOLD ";
|
|
if(exists($decomposition->{$code_value})) {
|
|
my($cv);
|
|
my(@vl);
|
|
foreach $cv (split(' ', $decomposition->{$code_value})) {
|
|
if(exists($casefold->{$cv})) {
|
|
push(@vl, $casefold->{$cv});
|
|
#print "$casefold->{$cv} ";
|
|
} else {
|
|
push(@vl, $cv);
|
|
#print "$cv ";
|
|
}
|
|
}
|
|
push(@values, join(" ", @vl));
|
|
} else {
|
|
if(exists($casefold->{$code_value})) {
|
|
push(@values, $casefold->{$code_value});
|
|
#print "$casefold->{$code_value} ";
|
|
} else {
|
|
push(@values, "FFFF");
|
|
#print "FFFF ";
|
|
}
|
|
}
|
|
#print "FOLD ";
|
|
if(exists($casefold->{$code_value})) {
|
|
push(@values, $casefold->{$code_value});
|
|
#print "$casefold->{$code_value} ";
|
|
} else {
|
|
push(@values, "FFFF");
|
|
#print "FFFF ";
|
|
}
|
|
#print "\n";
|
|
}
|
|
print STDERR scalar(@blocks) . " blocks of " . $block_count . " entries, factorized $duplicate blocks\n\t" if($verbose);
|
|
my($block_size) = 0;
|
|
my($block);
|
|
foreach $block (@blocks) {
|
|
my(@tmp) = split(/[| ]/, $block);
|
|
$block_size += scalar(@tmp) * 2;
|
|
}
|
|
#
|
|
# Pointer to the block array
|
|
#
|
|
$block_size += scalar(@blocks) * 4;
|
|
#
|
|
# Positions of the entries in the block
|
|
#
|
|
$block_size += $block_count * scalar(@blocks) * 2;
|
|
print STDERR "total block size = $block_size, " if($verbose);
|
|
my($index_size) = (1 << (16 - $block_shift)) * 2;
|
|
print STDERR "index size = " . $index_size . "\n\t" if($verbose);
|
|
my($total_size) = $block_size + $index_size;
|
|
print STDERR "total size = $total_size\n" if($verbose);
|
|
|
|
if($total_size < $best_total_size) {
|
|
$best_total_size = $total_size;
|
|
$best_blocks = \@blocks;
|
|
$best_indexes = \@indexes;
|
|
$best_block_shift = $block_shift;
|
|
}
|
|
}
|
|
|
|
my($block_count) = scalar(@$best_blocks);
|
|
my($block_size) = 1 << $best_block_shift;
|
|
|
|
#
|
|
# Constants that depend on the block size.
|
|
# result : $defines
|
|
#
|
|
my($defines) = <<EOF;
|
|
#define UNAC_BLOCK_SHIFT $best_block_shift
|
|
#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
|
|
#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
|
|
#define UNAC_BLOCK_COUNT $block_count
|
|
#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
|
|
EOF
|
|
#
|
|
# Mapping block number to index in data_table or position table.
|
|
# result : $index_out
|
|
#
|
|
my($count) = 0;
|
|
my($index);
|
|
my($index_out) = "unsigned short unac_indexes[UNAC_INDEXES_SIZE] = {\n";
|
|
foreach $index (@$best_indexes) {
|
|
$count++;
|
|
$index_out .= sprintf("%4s,", $index);
|
|
if($count % 15 == 0) {
|
|
$index_out .= "\n";
|
|
}
|
|
}
|
|
$index_out =~ s/,\s*\Z/\n/s;
|
|
$index_out .= "};\n";
|
|
|
|
#
|
|
# Generate the position table (map character number in block to
|
|
# position in the data string), the data_table that maps a block
|
|
# to a unsigned short array that contains the character (aka the
|
|
# data array) and the data arrays themselves that is a pure concatenation
|
|
# of all the unsigned short in a block.
|
|
# result : $position_out, $data_table_out, $data_out
|
|
#
|
|
my(@positions_out);
|
|
my($highest_position) = 0;
|
|
my(@data_table_out);
|
|
my($data_table_out) = "unsigned short* unac_data_table[UNAC_BLOCK_COUNT] = {\n";
|
|
my(@data_out);
|
|
my($block_number) = 0;
|
|
my($block);
|
|
foreach $block (@$best_blocks) {
|
|
my(@index);
|
|
my($position) = 0;
|
|
my($entry);
|
|
my(@data);
|
|
foreach $entry (split('\|', $block)) {
|
|
push(@index, $position);
|
|
my(@tmp) = split(' ', $entry);
|
|
push(@data, @tmp);
|
|
$position += scalar(@tmp);
|
|
}
|
|
push(@index, $position);
|
|
$highest_position = $position if($position > $highest_position);
|
|
push(@positions_out, "/* $block_number */ { " . join(", ", @index) . " }");
|
|
push(@data_table_out, "unac_data$block_number");
|
|
push(@data_out, "unsigned short unac_data$block_number" . "[] = { 0x" . join(", 0x", @data) . " };\n");
|
|
$block_number++;
|
|
}
|
|
my($position_type) = $highest_position >= 256 ? "short" : "char";
|
|
my($positions_out) = "unsigned $position_type unac_positions[UNAC_BLOCK_COUNT][3*UNAC_BLOCK_SIZE + 1] = {\n";
|
|
|
|
$positions_out .= join(",\n", @positions_out);
|
|
$positions_out .= "\n};\n";
|
|
my($data_out) = join("", @data_out);
|
|
$data_table_out .= join(",\n", @data_table_out);
|
|
$data_table_out .= "\n};\n";
|
|
|
|
#
|
|
# Tables declarations
|
|
# result : $declarations
|
|
#
|
|
my($declarations);
|
|
$declarations = <<EOF;
|
|
extern unsigned short unac_indexes[UNAC_INDEXES_SIZE];
|
|
extern unsigned $position_type unac_positions[UNAC_BLOCK_COUNT][3*UNAC_BLOCK_SIZE + 1];
|
|
extern unsigned short* unac_data_table[UNAC_BLOCK_COUNT];
|
|
EOF
|
|
for($block_number = 0; $block_number < $block_count; $block_number++) {
|
|
$declarations .= "extern unsigned short unac_data$block_number" . "[];\n";
|
|
}
|
|
|
|
$csource->{'tables'} = "$comment\n$index_out\n$positions_out\n$data_out\n$data_table_out";
|
|
$hsource->{'defines'} = $defines;
|
|
$hsource->{'declarations'} = $declarations;
|
|
|
|
spit("unac.c", $csource);
|
|
spit("unac.h", $hsource);
|
|
}
|
|
|
|
main();
|