Don't strip diacritics from Hindi Devanagari characters: they determine word meaning

This commit is contained in:
medoc 2013-10-26 18:56:25 +02:00
parent 97f9089709
commit 142c3be8de
3 changed files with 2836 additions and 990 deletions

View file

@ -73,24 +73,35 @@ sub main {
$ranges{$1}{$2} = $code_value;
}
# For kana japanese characters, we don't want to strip accents as I'm
# told that they are essential and stripping them does not
# make sense. Wonder why Unicode does these decompositions
# then... Problem: the first solution used was to decompose
# the japanese accented kana and not remove accents. But then
# Test for exceptions to unaccenting. Note that this is
# mostly based on blocks, when it should use the Unicode
# script property. In practice, for the scripts concerned,
# this does not currently look to be an issue.
# (The following comment was written for Japanese but also
# applies to the other exceptions.)
# For kana japanese characters, we don't want to strip
# accents as I'm told that they are essential and
# stripping them does not make sense.
# Problem: the first solution used was to decompose the
# Japanese accented kana and not remove accents. But then
# the unaccented character would match the string with
# accent. So now we don't decompose at all, but this means
# that, if the original text was decomposed, things don't work
# as intended as we should actually recombine the
# that, if the original text was decomposed, things don't
# work as intended as we should actually recombine the
# letter+accents in this case for data to be unified.
# Hiragana + Katakana
if (!(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff)
# Halfwidth katakana
&& !(hex $code_value >= 0xff65 && hex $code_value <= 0xff9f)
# Hindi Devanagari: main block, then Devanagari Extended
&& !(hex $code_value >= 0x0900 && hex $code_value <= 0x097f)
&& !(hex $code_value >= 0xa8e0 && hex $code_value <= 0xa8ff)
) {
# If a decomposition exists, record it
if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {
# Not for Hiragana + Katakana
if (!(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff) &&
# and Halfwidth katakana
!(hex $code_value >= 0xff65 && hex $code_value <= 0xff9f) ) {
$decomposition{$code_value} = $2;
}
}
if($general_category =~ /^M/) {
$mark{$code_value} = 1;
# For mark characters, we generate a 0 entry in the
@ -102,6 +113,7 @@ sub main {
# names have separate combining accent characters.
$decomposition{$code_value} = "0000";
}
}
$name{$code_value} = $character_name;
}
close(FILE);

File diff suppressed because it is too large Load diff

View file

@ -32,10 +32,10 @@ extern "C" {
#endif
/* Generated by builder. Do not modify. Start defines */
#define UNAC_BLOCK_SHIFT 4
#define UNAC_BLOCK_SHIFT 3
#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
#define UNAC_BLOCK_COUNT 422
#define UNAC_BLOCK_COUNT 714
#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
/* Generated by builder. Do not modify. End defines */
@ -602,6 +602,298 @@ extern unsigned short unac_data418[];
extern unsigned short unac_data419[];
extern unsigned short unac_data420[];
extern unsigned short unac_data421[];
extern unsigned short unac_data422[];
extern unsigned short unac_data423[];
extern unsigned short unac_data424[];
extern unsigned short unac_data425[];
extern unsigned short unac_data426[];
extern unsigned short unac_data427[];
extern unsigned short unac_data428[];
extern unsigned short unac_data429[];
extern unsigned short unac_data430[];
extern unsigned short unac_data431[];
extern unsigned short unac_data432[];
extern unsigned short unac_data433[];
extern unsigned short unac_data434[];
extern unsigned short unac_data435[];
extern unsigned short unac_data436[];
extern unsigned short unac_data437[];
extern unsigned short unac_data438[];
extern unsigned short unac_data439[];
extern unsigned short unac_data440[];
extern unsigned short unac_data441[];
extern unsigned short unac_data442[];
extern unsigned short unac_data443[];
extern unsigned short unac_data444[];
extern unsigned short unac_data445[];
extern unsigned short unac_data446[];
extern unsigned short unac_data447[];
extern unsigned short unac_data448[];
extern unsigned short unac_data449[];
extern unsigned short unac_data450[];
extern unsigned short unac_data451[];
extern unsigned short unac_data452[];
extern unsigned short unac_data453[];
extern unsigned short unac_data454[];
extern unsigned short unac_data455[];
extern unsigned short unac_data456[];
extern unsigned short unac_data457[];
extern unsigned short unac_data458[];
extern unsigned short unac_data459[];
extern unsigned short unac_data460[];
extern unsigned short unac_data461[];
extern unsigned short unac_data462[];
extern unsigned short unac_data463[];
extern unsigned short unac_data464[];
extern unsigned short unac_data465[];
extern unsigned short unac_data466[];
extern unsigned short unac_data467[];
extern unsigned short unac_data468[];
extern unsigned short unac_data469[];
extern unsigned short unac_data470[];
extern unsigned short unac_data471[];
extern unsigned short unac_data472[];
extern unsigned short unac_data473[];
extern unsigned short unac_data474[];
extern unsigned short unac_data475[];
extern unsigned short unac_data476[];
extern unsigned short unac_data477[];
extern unsigned short unac_data478[];
extern unsigned short unac_data479[];
extern unsigned short unac_data480[];
extern unsigned short unac_data481[];
extern unsigned short unac_data482[];
extern unsigned short unac_data483[];
extern unsigned short unac_data484[];
extern unsigned short unac_data485[];
extern unsigned short unac_data486[];
extern unsigned short unac_data487[];
extern unsigned short unac_data488[];
extern unsigned short unac_data489[];
extern unsigned short unac_data490[];
extern unsigned short unac_data491[];
extern unsigned short unac_data492[];
extern unsigned short unac_data493[];
extern unsigned short unac_data494[];
extern unsigned short unac_data495[];
extern unsigned short unac_data496[];
extern unsigned short unac_data497[];
extern unsigned short unac_data498[];
extern unsigned short unac_data499[];
extern unsigned short unac_data500[];
extern unsigned short unac_data501[];
extern unsigned short unac_data502[];
extern unsigned short unac_data503[];
extern unsigned short unac_data504[];
extern unsigned short unac_data505[];
extern unsigned short unac_data506[];
extern unsigned short unac_data507[];
extern unsigned short unac_data508[];
extern unsigned short unac_data509[];
extern unsigned short unac_data510[];
extern unsigned short unac_data511[];
extern unsigned short unac_data512[];
extern unsigned short unac_data513[];
extern unsigned short unac_data514[];
extern unsigned short unac_data515[];
extern unsigned short unac_data516[];
extern unsigned short unac_data517[];
extern unsigned short unac_data518[];
extern unsigned short unac_data519[];
extern unsigned short unac_data520[];
extern unsigned short unac_data521[];
extern unsigned short unac_data522[];
extern unsigned short unac_data523[];
extern unsigned short unac_data524[];
extern unsigned short unac_data525[];
extern unsigned short unac_data526[];
extern unsigned short unac_data527[];
extern unsigned short unac_data528[];
extern unsigned short unac_data529[];
extern unsigned short unac_data530[];
extern unsigned short unac_data531[];
extern unsigned short unac_data532[];
extern unsigned short unac_data533[];
extern unsigned short unac_data534[];
extern unsigned short unac_data535[];
extern unsigned short unac_data536[];
extern unsigned short unac_data537[];
extern unsigned short unac_data538[];
extern unsigned short unac_data539[];
extern unsigned short unac_data540[];
extern unsigned short unac_data541[];
extern unsigned short unac_data542[];
extern unsigned short unac_data543[];
extern unsigned short unac_data544[];
extern unsigned short unac_data545[];
extern unsigned short unac_data546[];
extern unsigned short unac_data547[];
extern unsigned short unac_data548[];
extern unsigned short unac_data549[];
extern unsigned short unac_data550[];
extern unsigned short unac_data551[];
extern unsigned short unac_data552[];
extern unsigned short unac_data553[];
extern unsigned short unac_data554[];
extern unsigned short unac_data555[];
extern unsigned short unac_data556[];
extern unsigned short unac_data557[];
extern unsigned short unac_data558[];
extern unsigned short unac_data559[];
extern unsigned short unac_data560[];
extern unsigned short unac_data561[];
extern unsigned short unac_data562[];
extern unsigned short unac_data563[];
extern unsigned short unac_data564[];
extern unsigned short unac_data565[];
extern unsigned short unac_data566[];
extern unsigned short unac_data567[];
extern unsigned short unac_data568[];
extern unsigned short unac_data569[];
extern unsigned short unac_data570[];
extern unsigned short unac_data571[];
extern unsigned short unac_data572[];
extern unsigned short unac_data573[];
extern unsigned short unac_data574[];
extern unsigned short unac_data575[];
extern unsigned short unac_data576[];
extern unsigned short unac_data577[];
extern unsigned short unac_data578[];
extern unsigned short unac_data579[];
extern unsigned short unac_data580[];
extern unsigned short unac_data581[];
extern unsigned short unac_data582[];
extern unsigned short unac_data583[];
extern unsigned short unac_data584[];
extern unsigned short unac_data585[];
extern unsigned short unac_data586[];
extern unsigned short unac_data587[];
extern unsigned short unac_data588[];
extern unsigned short unac_data589[];
extern unsigned short unac_data590[];
extern unsigned short unac_data591[];
extern unsigned short unac_data592[];
extern unsigned short unac_data593[];
extern unsigned short unac_data594[];
extern unsigned short unac_data595[];
extern unsigned short unac_data596[];
extern unsigned short unac_data597[];
extern unsigned short unac_data598[];
extern unsigned short unac_data599[];
extern unsigned short unac_data600[];
extern unsigned short unac_data601[];
extern unsigned short unac_data602[];
extern unsigned short unac_data603[];
extern unsigned short unac_data604[];
extern unsigned short unac_data605[];
extern unsigned short unac_data606[];
extern unsigned short unac_data607[];
extern unsigned short unac_data608[];
extern unsigned short unac_data609[];
extern unsigned short unac_data610[];
extern unsigned short unac_data611[];
extern unsigned short unac_data612[];
extern unsigned short unac_data613[];
extern unsigned short unac_data614[];
extern unsigned short unac_data615[];
extern unsigned short unac_data616[];
extern unsigned short unac_data617[];
extern unsigned short unac_data618[];
extern unsigned short unac_data619[];
extern unsigned short unac_data620[];
extern unsigned short unac_data621[];
extern unsigned short unac_data622[];
extern unsigned short unac_data623[];
extern unsigned short unac_data624[];
extern unsigned short unac_data625[];
extern unsigned short unac_data626[];
extern unsigned short unac_data627[];
extern unsigned short unac_data628[];
extern unsigned short unac_data629[];
extern unsigned short unac_data630[];
extern unsigned short unac_data631[];
extern unsigned short unac_data632[];
extern unsigned short unac_data633[];
extern unsigned short unac_data634[];
extern unsigned short unac_data635[];
extern unsigned short unac_data636[];
extern unsigned short unac_data637[];
extern unsigned short unac_data638[];
extern unsigned short unac_data639[];
extern unsigned short unac_data640[];
extern unsigned short unac_data641[];
extern unsigned short unac_data642[];
extern unsigned short unac_data643[];
extern unsigned short unac_data644[];
extern unsigned short unac_data645[];
extern unsigned short unac_data646[];
extern unsigned short unac_data647[];
extern unsigned short unac_data648[];
extern unsigned short unac_data649[];
extern unsigned short unac_data650[];
extern unsigned short unac_data651[];
extern unsigned short unac_data652[];
extern unsigned short unac_data653[];
extern unsigned short unac_data654[];
extern unsigned short unac_data655[];
extern unsigned short unac_data656[];
extern unsigned short unac_data657[];
extern unsigned short unac_data658[];
extern unsigned short unac_data659[];
extern unsigned short unac_data660[];
extern unsigned short unac_data661[];
extern unsigned short unac_data662[];
extern unsigned short unac_data663[];
extern unsigned short unac_data664[];
extern unsigned short unac_data665[];
extern unsigned short unac_data666[];
extern unsigned short unac_data667[];
extern unsigned short unac_data668[];
extern unsigned short unac_data669[];
extern unsigned short unac_data670[];
extern unsigned short unac_data671[];
extern unsigned short unac_data672[];
extern unsigned short unac_data673[];
extern unsigned short unac_data674[];
extern unsigned short unac_data675[];
extern unsigned short unac_data676[];
extern unsigned short unac_data677[];
extern unsigned short unac_data678[];
extern unsigned short unac_data679[];
extern unsigned short unac_data680[];
extern unsigned short unac_data681[];
extern unsigned short unac_data682[];
extern unsigned short unac_data683[];
extern unsigned short unac_data684[];
extern unsigned short unac_data685[];
extern unsigned short unac_data686[];
extern unsigned short unac_data687[];
extern unsigned short unac_data688[];
extern unsigned short unac_data689[];
extern unsigned short unac_data690[];
extern unsigned short unac_data691[];
extern unsigned short unac_data692[];
extern unsigned short unac_data693[];
extern unsigned short unac_data694[];
extern unsigned short unac_data695[];
extern unsigned short unac_data696[];
extern unsigned short unac_data697[];
extern unsigned short unac_data698[];
extern unsigned short unac_data699[];
extern unsigned short unac_data700[];
extern unsigned short unac_data701[];
extern unsigned short unac_data702[];
extern unsigned short unac_data703[];
extern unsigned short unac_data704[];
extern unsigned short unac_data705[];
extern unsigned short unac_data706[];
extern unsigned short unac_data707[];
extern unsigned short unac_data708[];
extern unsigned short unac_data709[];
extern unsigned short unac_data710[];
extern unsigned short unac_data711[];
extern unsigned short unac_data712[];
extern unsigned short unac_data713[];
/* Generated by builder. Do not modify. End declarations */
#ifdef __cplusplus