# # Copyright (C) 2000, 2001, 2002 Loic Dachary # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # # # Generate a table mapping UTF-16 characters to their unaccented # equivalent. Some characters such as fi (one character) are expanded # into two letters : f and i. In Unicode jargon it means that the table # map each character to its compatibility decomposition in which marks # were stripped. # # The format of the $base file can be found at: # http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html # use strict; use Getopt::Long; sub main { my($base) = "UnicodeData-@UNICODE_VERSION@.txt"; my($cfbase) = "CaseFolding-@UNICODE_VERSION@.txt"; my($verbose); my($source); my($reference); GetOptions("verbose+" => \$verbose, "database=s" => \$base, "source!" => \$source, "reference!" => \$reference); my(%decomposition, %mark, %name); my(%ranges); open(FILE, "<$base") or die "cannot open $base for reading : $!"; while() { next if(/^\s*#/); # Skip comments my($code_value, $character_name, $general_category, $canonical_combining_classes, $bidirectional_category, $character_decomposition_mapping, $decimal_digit_value, $digit_value, $numeric_value, $mirrored, $unicode_1_0_name, $_10646_comment_field, $uppercase_mapping, $lowercase_mapping, $titlecase_mapping) = split(/;/, $_); # Basic plane only last if (hex $code_value > 0xffff); # # Generate ranges of values that are not explicitly listed. # CJK ideographs for instance. # if($character_name =~ /^<(.*), (First|Last)>/) { $ranges{$1}{$2} = $code_value; } # Test for exceptions to unaccenting. Note that this is # mostly based on blocks when it should use the Unicode # script property. In practise, for the script concerned, # this does not look to be an issue currently # (following comment made for japanese but also concerns # other exceptions) # For kana japanese characters, we don't want to strip # accents as I'm told that they are essential and # stripping them does not make sense. # Problem: the first solution used was to decompose the # Japanese accented kana and not remove accents. But then # the unaccented character would match the string with # accent. So now we don't decompose at all, but this means # that, if the original text was decomposed, things don't # work as intended as we should actually recombine the # letter+accents in this case for data to be unified. # Hiragana + Katakana if (!(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff) # Halfwidth katakana && !(hex $code_value >= 0xff65 && hex $code_value <= 0xff9f) # Hindi Devanagari && !(hex $code_value >= 0x0900 && hex $code_value <= 0x097f) && !(hex $code_value >= 0xa8e0 && hex $code_value <= 0xa8ff) # Bengali && !(hex $code_value >= 0x0980 && hex $code_value <= 0x09ff) ) { # If a decomposition exists, record it if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) { $decomposition{$code_value} = $2; } if($general_category =~ /^M/) { $mark{$code_value} = 1; # For mark caracters, we generate a 0 entry in the # decomposition table. This signals to the c code that no # output should be generated. Slightly hacky but ok. The # original code left mark character go through (generating # still accented output if the input was in decomposed # form). Decomposed text is rare, but, for example, macosx file # names have separate combining accent characters. $decomposition{$code_value} = "0000"; } } $name{$code_value} = $character_name; } close(FILE); # Generate compatibility decomposition and strip marks # (marks == diacritics == accents) # # We also forbid any excursion out of the basic plane. my($from, $to); while(($from, $to) = each(%decomposition)) { my(@code_values) = split(' ', $to); my($code_value); my(@decomposition); while(@code_values) { my($code_value) = shift(@code_values); if (hex $code_value > 0xffff) { undef @decomposition; last; } # marks also have entries in the decomposition table (so that # they can be suppressed when found in input), but no output # component should be generated for them. if (!exists($mark{$code_value})) { if(exists($decomposition{$code_value})) { push(@code_values, split(' ', $decomposition{$code_value})); } else { push(@decomposition, $code_value); } } } if(@decomposition) { $decomposition{$from} = "@decomposition"; } else { delete($decomposition{$from}); } } # Read in the casefolding file my(%casefold); open(FILE, "<$cfbase") or die "cannot open $cfbase for reading : $!"; while() { next if(/^\s*#/); # Skip comments my($code_value, $foldstatus, $folded) = split(/;/, $_); if ($foldstatus =~ /C|F/) { $casefold{$code_value} = $folded; } } close(FILE); #showcasefold(\%casefold); reference(\%decomposition, $verbose) if($reference); source(\%decomposition, \%name, \%casefold, $verbose) if($source); } sub showcasefold { my($casefold) = @_; my($code_value); foreach $code_value (0 .. 0xFFFF) { $code_value = uc(sprintf("%04x", $code_value)); print "$code_value"; if(exists($casefold->{$code_value})) { print " => $casefold->{$code_value}\n"; } else { print "\n"; } } } # # Generate machine readable file mapping all UTF-16 codes # to their unaccented replacement. This file can be compared # with the output of a program doing the same mapping using the # libunac library. # sub reference { my($decomposition, $verbose) = @_; my($code_value); foreach $code_value (0 .. 0xFFFF) { $code_value = uc(sprintf("%04x", $code_value)); print "$code_value"; if(exists($decomposition->{$code_value})) { print " => $decomposition->{$code_value}\n"; } else { print "\n"; } } } # # Read input file into hash table and return it. # # The input is divided in chuncks according to special markers. For # instance: # # before # /* Generated by builder. Do not modify. Start a_tag */ # bla bla # /* Generated by builder. Do not modify. End a_tag */ # after # /* Generated by builder. Do not modify. Start b_tag */ # more stuff # /* Generated by builder. Do not modify. End b_tag */ # still something # # Will generate the following hash: # # { # 'list' => [ 1, a_tag, 2, b_tag, 3 ], # '1' => "before\n", # 'a_tag' => undef, # '2' => "after\n"; # 'b_tag' => undef, # '3' => "still something\n" # } # # The caller may then assign a string to the a_tag and b_tag entries # and then call the spit function to rebuild the file. # sub slurp { my($file) = @_; my(%content); my($count) = 1; my(@lines); open(FILE, "<$file") or die "cannot open $file for reading : $!"; while() { if(/Do not modify. Start\s+(\w+)/i) { push(@{$content{'list'}}, $count); $content{$count} = join("", @lines); $count++; push(@{$content{'list'}}, $1); @lines = (); } next if(/Do not modify. Start/i .. /Do not modify. End/i); push(@lines, $_); } if(@lines) { push(@{$content{'list'}}, $count); $content{$count} = join("", @lines); } close(FILE); return \%content; } # # Write the $file with the content of the $content hash table. # See the slurp function for a description of the $content format. # sub spit { my($file, $content) = @_; open(FILE, ">$file") or die "cannot open $file for writing : $!"; my($tag); foreach $tag (@{$content->{'list'}}) { print(FILE "/* Generated by builder. Do not modify. Start $tag */\n") if($tag !~ /^\d+$/); print FILE $content->{$tag}; print(FILE "/* Generated by builder. Do not modify. End $tag */\n") if($tag !~ /^\d+$/); } close(FILE); } # # Generate tables, defines and code in the unac.c and unac.h files. # The unac.c and unac.h files are substituted in place. # sub source { my($decomposition, $name, $casefold, $verbose) = @_; my($csource) = slurp("unac.c"); my($hsource) = slurp("unac.h"); # # Human readable table # my(@comment); push(@comment, "/*\n"); my($from); foreach $from (sort(keys(%$decomposition))) { my($character_name) = $name->{$from}; $character_name = "??" if(!$character_name); push(@comment, " * $from $character_name\n"); my($code_value); foreach $code_value (split(' ', $decomposition->{$from})) { $character_name = $name->{$code_value} || "??"; push(@comment, " * \t$code_value $character_name\n"); } } push(@comment, "*/\n"); my($comment) = join("", @comment); # # Select the best block size (the one that takes less space) # # result: $best_blocks (array of blocks that contain exactly # $block_count replacements. Each block # is a string containing replacements # separated by |) # $best_indexes (array mapping block number to a block # in the $best_blocks array) # $best_block_shift (the size of the block) # # Within a block, if the character has no replacement the 0xFFFF # placeholder is inserted. # my($best_blocks); my($best_indexes); my($best_block_shift); my($best_total_size) = 10 * 1024 * 1024; my($block_shift); foreach $block_shift (2 .. 10) { my($block_count) = 1 << $block_shift; my(@blocks, @indexes); my($duplicate) = 0; my(@values); my($code_value); foreach $code_value (0 .. 0x10000) { if($code_value > 0 && $code_value % $block_count == 0) { my($block) = join("|", @values); my($existing_block); my($index) = 0; my($found); foreach $existing_block (@blocks) { if($block eq $existing_block) { push(@indexes, $index); $found = 1; $duplicate++; last; } $index++; } if(!$found) { push(@indexes, $index); push(@blocks, $block); } @values = (); } $code_value = uc(sprintf("%04x", $code_value)); #print "$code_value UNAC "; if(exists($decomposition->{$code_value})) { push(@values, $decomposition->{$code_value}); #print "$decomposition->{$code_value} "; } else { push(@values, "FFFF"); #print "FFFF "; } # We push both the case-folded version of the unaccented char # and the case-folded version of the original one. This # makes the table a little bigger, but allows # independantly unaccenting, folding or both #print "UNACFOLD "; if(exists($decomposition->{$code_value})) { my($cv); my(@vl); foreach $cv (split(' ', $decomposition->{$code_value})) { if(exists($casefold->{$cv})) { push(@vl, $casefold->{$cv}); #print "$casefold->{$cv} "; } else { push(@vl, $cv); #print "$cv "; } } push(@values, join(" ", @vl)); } else { if(exists($casefold->{$code_value})) { push(@values, $casefold->{$code_value}); #print "$casefold->{$code_value} "; } else { push(@values, "FFFF"); #print "FFFF "; } } #print "FOLD "; if(exists($casefold->{$code_value})) { push(@values, $casefold->{$code_value}); #print "$casefold->{$code_value} "; } else { push(@values, "FFFF"); #print "FFFF "; } #print "\n"; } print STDERR scalar(@blocks) . " blocks of " . $block_count . " entries, factorized $duplicate blocks\n\t" if($verbose); my($block_size) = 0; my($block); foreach $block (@blocks) { my(@tmp) = split(/[| ]/, $block); $block_size += scalar(@tmp) * 2; } # # Pointer to the block array # $block_size += scalar(@blocks) * 4; # # Positions of the entries in the block # $block_size += $block_count * scalar(@blocks) * 2; print STDERR "total block size = $block_size, " if($verbose); my($index_size) = (1 << (16 - $block_shift)) * 2; print STDERR "index size = " . $index_size . "\n\t" if($verbose); my($total_size) = $block_size + $index_size; print STDERR "total size = $total_size\n" if($verbose); if($total_size < $best_total_size) { $best_total_size = $total_size; $best_blocks = \@blocks; $best_indexes = \@indexes; $best_block_shift = $block_shift; } } my($block_count) = scalar(@$best_blocks); my($block_size) = 1 << $best_block_shift; # # Constants that depend on the block size. # result : $defines # my($defines) = <> UNAC_BLOCK_SHIFT) EOF # # Mapping block number to index in data_table or position table. # result : $index_out # my($count) = 0; my($index); my($index_out) = "unsigned short unac_indexes[UNAC_INDEXES_SIZE] = {\n"; foreach $index (@$best_indexes) { $count++; $index_out .= sprintf("%4s,", $index); if($count % 15 == 0) { $index_out .= "\n"; } } $index_out =~ s/,\s*\Z/\n/s; $index_out .= "};\n"; # # Generate the position table (map character number in block to # position in the data string), the data_table that maps a block # to a unsigned short array that contains the character (aka the # data array) and the data arrays themselves that is a pure concatenation # of all the unsigned short in a block. # result : $position_out, $data_table_out, $data_out # my(@positions_out); my($highest_position) = 0; my(@data_table_out); my($data_table_out) = "unsigned short* unac_data_table[UNAC_BLOCK_COUNT] = {\n"; my(@data_out); my($block_number) = 0; my($block); foreach $block (@$best_blocks) { my(@index); my($position) = 0; my($entry); my(@data); foreach $entry (split('\|', $block)) { push(@index, $position); my(@tmp) = split(' ', $entry); push(@data, @tmp); $position += scalar(@tmp); } push(@index, $position); $highest_position = $position if($position > $highest_position); push(@positions_out, "/* $block_number */ { " . join(", ", @index) . " }"); push(@data_table_out, "unac_data$block_number"); push(@data_out, "unsigned short unac_data$block_number" . "[] = { 0x" . join(", 0x", @data) . " };\n"); $block_number++; } my($position_type) = $highest_position >= 256 ? "short" : "char"; my($positions_out) = "unsigned $position_type unac_positions[UNAC_BLOCK_COUNT][3*UNAC_BLOCK_SIZE + 1] = {\n"; $positions_out .= join(",\n", @positions_out); $positions_out .= "\n};\n"; my($data_out) = join("", @data_out); $data_table_out .= join(",\n", @data_table_out); $data_table_out .= "\n};\n"; # # Tables declarations # result : $declarations # my($declarations); $declarations = <{'tables'} = "$comment\n$index_out\n$positions_out\n$data_out\n$data_table_out"; $hsource->{'defines'} = $defines; $hsource->{'declarations'} = $declarations; spit("unac.c", $csource); spit("unac.h", $hsource); } main();