Don't strip diacritics from Hindi Devanagari characters; they are essential to word meaning
This commit is contained in:
parent
97f9089709
commit
142c3be8de
3 changed files with 2836 additions and 990 deletions
|
@ -73,24 +73,35 @@ sub main {
|
|||
$ranges{$1}{$2} = $code_value;
|
||||
}
|
||||
|
||||
# For kana japanese characters, we don't want to strip accents as I'm
|
||||
# told that they are essential and stripping them does not
|
||||
# make sense. Wonder why Unicode does these decompositions
|
||||
# then... Problem: the first solution used was to decompose
|
||||
# the japanese accented kana and not remove accents. But then
|
||||
# Test for exceptions to unaccenting. Note that this is
|
||||
# mostly based on blocks when it should use the Unicode
|
||||
# script property. In practice, for the script concerned,
|
||||
# this does not look to be an issue currently
|
||||
# (following comment made for japanese but also concerns
|
||||
# other exceptions)
|
||||
# For kana japanese characters, we don't want to strip
|
||||
# accents as I'm told that they are essential and
|
||||
# stripping them does not make sense.
|
||||
# Problem: the first solution used was to decompose the
|
||||
# Japanese accented kana and not remove accents. But then
|
||||
# the unaccented character would match the string with
|
||||
# accent. So now we don't decompose at all, but this means
|
||||
# that, if the original text was decomposed, things don't work
|
||||
# as intended as we should actually recombine the
|
||||
# that, if the original text was decomposed, things don't
|
||||
# work as intended as we should actually recombine the
|
||||
# letter+accents in this case for data to be unified.
|
||||
|
||||
# Hiragana + Katakana
|
||||
if (!(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff)
|
||||
# Halfwidth katakana
|
||||
&& !(hex $code_value >= 0xff65 && hex $code_value <= 0xff9f)
|
||||
# Hindi Devanagari (main block, then Devanagari Extended)
|
||||
&& !(hex $code_value >= 0x0900 && hex $code_value <= 0x097f)
|
||||
&& !(hex $code_value >= 0xa8e0 && hex $code_value <= 0xa8ff)
|
||||
) {
|
||||
# If a decomposition exists, record it
|
||||
if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {
|
||||
# Not for Hiragana + Katakana
|
||||
if (!(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff) &&
|
||||
# and Halfwidth katakana
|
||||
!(hex $code_value >= 0xff65 && hex $code_value <= 0xff9f) ) {
|
||||
$decomposition{$code_value} = $2;
|
||||
}
|
||||
}
|
||||
if($general_category =~ /^M/) {
|
||||
$mark{$code_value} = 1;
|
||||
# For mark characters, we generate a 0 entry in the
|
||||
|
@ -102,6 +113,7 @@ sub main {
|
|||
# names have separate combining accent characters.
|
||||
$decomposition{$code_value} = "0000";
|
||||
}
|
||||
}
|
||||
$name{$code_value} = $character_name;
|
||||
}
|
||||
close(FILE);
|
||||
|
|
3472
unac/unac.c
3472
unac/unac.c
File diff suppressed because it is too large
Load diff
296
unac/unac.h
296
unac/unac.h
|
@ -32,10 +32,10 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
/* Generated by builder. Do not modify. Start defines */
|
||||
#define UNAC_BLOCK_SHIFT 4
|
||||
#define UNAC_BLOCK_SHIFT 3
|
||||
#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
|
||||
#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
|
||||
#define UNAC_BLOCK_COUNT 422
|
||||
#define UNAC_BLOCK_COUNT 714
|
||||
#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
|
||||
/* Generated by builder. Do not modify. End defines */
|
||||
|
||||
|
@ -602,6 +602,298 @@ extern unsigned short unac_data418[];
|
|||
extern unsigned short unac_data419[];
|
||||
extern unsigned short unac_data420[];
|
||||
extern unsigned short unac_data421[];
|
||||
extern unsigned short unac_data422[];
|
||||
extern unsigned short unac_data423[];
|
||||
extern unsigned short unac_data424[];
|
||||
extern unsigned short unac_data425[];
|
||||
extern unsigned short unac_data426[];
|
||||
extern unsigned short unac_data427[];
|
||||
extern unsigned short unac_data428[];
|
||||
extern unsigned short unac_data429[];
|
||||
extern unsigned short unac_data430[];
|
||||
extern unsigned short unac_data431[];
|
||||
extern unsigned short unac_data432[];
|
||||
extern unsigned short unac_data433[];
|
||||
extern unsigned short unac_data434[];
|
||||
extern unsigned short unac_data435[];
|
||||
extern unsigned short unac_data436[];
|
||||
extern unsigned short unac_data437[];
|
||||
extern unsigned short unac_data438[];
|
||||
extern unsigned short unac_data439[];
|
||||
extern unsigned short unac_data440[];
|
||||
extern unsigned short unac_data441[];
|
||||
extern unsigned short unac_data442[];
|
||||
extern unsigned short unac_data443[];
|
||||
extern unsigned short unac_data444[];
|
||||
extern unsigned short unac_data445[];
|
||||
extern unsigned short unac_data446[];
|
||||
extern unsigned short unac_data447[];
|
||||
extern unsigned short unac_data448[];
|
||||
extern unsigned short unac_data449[];
|
||||
extern unsigned short unac_data450[];
|
||||
extern unsigned short unac_data451[];
|
||||
extern unsigned short unac_data452[];
|
||||
extern unsigned short unac_data453[];
|
||||
extern unsigned short unac_data454[];
|
||||
extern unsigned short unac_data455[];
|
||||
extern unsigned short unac_data456[];
|
||||
extern unsigned short unac_data457[];
|
||||
extern unsigned short unac_data458[];
|
||||
extern unsigned short unac_data459[];
|
||||
extern unsigned short unac_data460[];
|
||||
extern unsigned short unac_data461[];
|
||||
extern unsigned short unac_data462[];
|
||||
extern unsigned short unac_data463[];
|
||||
extern unsigned short unac_data464[];
|
||||
extern unsigned short unac_data465[];
|
||||
extern unsigned short unac_data466[];
|
||||
extern unsigned short unac_data467[];
|
||||
extern unsigned short unac_data468[];
|
||||
extern unsigned short unac_data469[];
|
||||
extern unsigned short unac_data470[];
|
||||
extern unsigned short unac_data471[];
|
||||
extern unsigned short unac_data472[];
|
||||
extern unsigned short unac_data473[];
|
||||
extern unsigned short unac_data474[];
|
||||
extern unsigned short unac_data475[];
|
||||
extern unsigned short unac_data476[];
|
||||
extern unsigned short unac_data477[];
|
||||
extern unsigned short unac_data478[];
|
||||
extern unsigned short unac_data479[];
|
||||
extern unsigned short unac_data480[];
|
||||
extern unsigned short unac_data481[];
|
||||
extern unsigned short unac_data482[];
|
||||
extern unsigned short unac_data483[];
|
||||
extern unsigned short unac_data484[];
|
||||
extern unsigned short unac_data485[];
|
||||
extern unsigned short unac_data486[];
|
||||
extern unsigned short unac_data487[];
|
||||
extern unsigned short unac_data488[];
|
||||
extern unsigned short unac_data489[];
|
||||
extern unsigned short unac_data490[];
|
||||
extern unsigned short unac_data491[];
|
||||
extern unsigned short unac_data492[];
|
||||
extern unsigned short unac_data493[];
|
||||
extern unsigned short unac_data494[];
|
||||
extern unsigned short unac_data495[];
|
||||
extern unsigned short unac_data496[];
|
||||
extern unsigned short unac_data497[];
|
||||
extern unsigned short unac_data498[];
|
||||
extern unsigned short unac_data499[];
|
||||
extern unsigned short unac_data500[];
|
||||
extern unsigned short unac_data501[];
|
||||
extern unsigned short unac_data502[];
|
||||
extern unsigned short unac_data503[];
|
||||
extern unsigned short unac_data504[];
|
||||
extern unsigned short unac_data505[];
|
||||
extern unsigned short unac_data506[];
|
||||
extern unsigned short unac_data507[];
|
||||
extern unsigned short unac_data508[];
|
||||
extern unsigned short unac_data509[];
|
||||
extern unsigned short unac_data510[];
|
||||
extern unsigned short unac_data511[];
|
||||
extern unsigned short unac_data512[];
|
||||
extern unsigned short unac_data513[];
|
||||
extern unsigned short unac_data514[];
|
||||
extern unsigned short unac_data515[];
|
||||
extern unsigned short unac_data516[];
|
||||
extern unsigned short unac_data517[];
|
||||
extern unsigned short unac_data518[];
|
||||
extern unsigned short unac_data519[];
|
||||
extern unsigned short unac_data520[];
|
||||
extern unsigned short unac_data521[];
|
||||
extern unsigned short unac_data522[];
|
||||
extern unsigned short unac_data523[];
|
||||
extern unsigned short unac_data524[];
|
||||
extern unsigned short unac_data525[];
|
||||
extern unsigned short unac_data526[];
|
||||
extern unsigned short unac_data527[];
|
||||
extern unsigned short unac_data528[];
|
||||
extern unsigned short unac_data529[];
|
||||
extern unsigned short unac_data530[];
|
||||
extern unsigned short unac_data531[];
|
||||
extern unsigned short unac_data532[];
|
||||
extern unsigned short unac_data533[];
|
||||
extern unsigned short unac_data534[];
|
||||
extern unsigned short unac_data535[];
|
||||
extern unsigned short unac_data536[];
|
||||
extern unsigned short unac_data537[];
|
||||
extern unsigned short unac_data538[];
|
||||
extern unsigned short unac_data539[];
|
||||
extern unsigned short unac_data540[];
|
||||
extern unsigned short unac_data541[];
|
||||
extern unsigned short unac_data542[];
|
||||
extern unsigned short unac_data543[];
|
||||
extern unsigned short unac_data544[];
|
||||
extern unsigned short unac_data545[];
|
||||
extern unsigned short unac_data546[];
|
||||
extern unsigned short unac_data547[];
|
||||
extern unsigned short unac_data548[];
|
||||
extern unsigned short unac_data549[];
|
||||
extern unsigned short unac_data550[];
|
||||
extern unsigned short unac_data551[];
|
||||
extern unsigned short unac_data552[];
|
||||
extern unsigned short unac_data553[];
|
||||
extern unsigned short unac_data554[];
|
||||
extern unsigned short unac_data555[];
|
||||
extern unsigned short unac_data556[];
|
||||
extern unsigned short unac_data557[];
|
||||
extern unsigned short unac_data558[];
|
||||
extern unsigned short unac_data559[];
|
||||
extern unsigned short unac_data560[];
|
||||
extern unsigned short unac_data561[];
|
||||
extern unsigned short unac_data562[];
|
||||
extern unsigned short unac_data563[];
|
||||
extern unsigned short unac_data564[];
|
||||
extern unsigned short unac_data565[];
|
||||
extern unsigned short unac_data566[];
|
||||
extern unsigned short unac_data567[];
|
||||
extern unsigned short unac_data568[];
|
||||
extern unsigned short unac_data569[];
|
||||
extern unsigned short unac_data570[];
|
||||
extern unsigned short unac_data571[];
|
||||
extern unsigned short unac_data572[];
|
||||
extern unsigned short unac_data573[];
|
||||
extern unsigned short unac_data574[];
|
||||
extern unsigned short unac_data575[];
|
||||
extern unsigned short unac_data576[];
|
||||
extern unsigned short unac_data577[];
|
||||
extern unsigned short unac_data578[];
|
||||
extern unsigned short unac_data579[];
|
||||
extern unsigned short unac_data580[];
|
||||
extern unsigned short unac_data581[];
|
||||
extern unsigned short unac_data582[];
|
||||
extern unsigned short unac_data583[];
|
||||
extern unsigned short unac_data584[];
|
||||
extern unsigned short unac_data585[];
|
||||
extern unsigned short unac_data586[];
|
||||
extern unsigned short unac_data587[];
|
||||
extern unsigned short unac_data588[];
|
||||
extern unsigned short unac_data589[];
|
||||
extern unsigned short unac_data590[];
|
||||
extern unsigned short unac_data591[];
|
||||
extern unsigned short unac_data592[];
|
||||
extern unsigned short unac_data593[];
|
||||
extern unsigned short unac_data594[];
|
||||
extern unsigned short unac_data595[];
|
||||
extern unsigned short unac_data596[];
|
||||
extern unsigned short unac_data597[];
|
||||
extern unsigned short unac_data598[];
|
||||
extern unsigned short unac_data599[];
|
||||
extern unsigned short unac_data600[];
|
||||
extern unsigned short unac_data601[];
|
||||
extern unsigned short unac_data602[];
|
||||
extern unsigned short unac_data603[];
|
||||
extern unsigned short unac_data604[];
|
||||
extern unsigned short unac_data605[];
|
||||
extern unsigned short unac_data606[];
|
||||
extern unsigned short unac_data607[];
|
||||
extern unsigned short unac_data608[];
|
||||
extern unsigned short unac_data609[];
|
||||
extern unsigned short unac_data610[];
|
||||
extern unsigned short unac_data611[];
|
||||
extern unsigned short unac_data612[];
|
||||
extern unsigned short unac_data613[];
|
||||
extern unsigned short unac_data614[];
|
||||
extern unsigned short unac_data615[];
|
||||
extern unsigned short unac_data616[];
|
||||
extern unsigned short unac_data617[];
|
||||
extern unsigned short unac_data618[];
|
||||
extern unsigned short unac_data619[];
|
||||
extern unsigned short unac_data620[];
|
||||
extern unsigned short unac_data621[];
|
||||
extern unsigned short unac_data622[];
|
||||
extern unsigned short unac_data623[];
|
||||
extern unsigned short unac_data624[];
|
||||
extern unsigned short unac_data625[];
|
||||
extern unsigned short unac_data626[];
|
||||
extern unsigned short unac_data627[];
|
||||
extern unsigned short unac_data628[];
|
||||
extern unsigned short unac_data629[];
|
||||
extern unsigned short unac_data630[];
|
||||
extern unsigned short unac_data631[];
|
||||
extern unsigned short unac_data632[];
|
||||
extern unsigned short unac_data633[];
|
||||
extern unsigned short unac_data634[];
|
||||
extern unsigned short unac_data635[];
|
||||
extern unsigned short unac_data636[];
|
||||
extern unsigned short unac_data637[];
|
||||
extern unsigned short unac_data638[];
|
||||
extern unsigned short unac_data639[];
|
||||
extern unsigned short unac_data640[];
|
||||
extern unsigned short unac_data641[];
|
||||
extern unsigned short unac_data642[];
|
||||
extern unsigned short unac_data643[];
|
||||
extern unsigned short unac_data644[];
|
||||
extern unsigned short unac_data645[];
|
||||
extern unsigned short unac_data646[];
|
||||
extern unsigned short unac_data647[];
|
||||
extern unsigned short unac_data648[];
|
||||
extern unsigned short unac_data649[];
|
||||
extern unsigned short unac_data650[];
|
||||
extern unsigned short unac_data651[];
|
||||
extern unsigned short unac_data652[];
|
||||
extern unsigned short unac_data653[];
|
||||
extern unsigned short unac_data654[];
|
||||
extern unsigned short unac_data655[];
|
||||
extern unsigned short unac_data656[];
|
||||
extern unsigned short unac_data657[];
|
||||
extern unsigned short unac_data658[];
|
||||
extern unsigned short unac_data659[];
|
||||
extern unsigned short unac_data660[];
|
||||
extern unsigned short unac_data661[];
|
||||
extern unsigned short unac_data662[];
|
||||
extern unsigned short unac_data663[];
|
||||
extern unsigned short unac_data664[];
|
||||
extern unsigned short unac_data665[];
|
||||
extern unsigned short unac_data666[];
|
||||
extern unsigned short unac_data667[];
|
||||
extern unsigned short unac_data668[];
|
||||
extern unsigned short unac_data669[];
|
||||
extern unsigned short unac_data670[];
|
||||
extern unsigned short unac_data671[];
|
||||
extern unsigned short unac_data672[];
|
||||
extern unsigned short unac_data673[];
|
||||
extern unsigned short unac_data674[];
|
||||
extern unsigned short unac_data675[];
|
||||
extern unsigned short unac_data676[];
|
||||
extern unsigned short unac_data677[];
|
||||
extern unsigned short unac_data678[];
|
||||
extern unsigned short unac_data679[];
|
||||
extern unsigned short unac_data680[];
|
||||
extern unsigned short unac_data681[];
|
||||
extern unsigned short unac_data682[];
|
||||
extern unsigned short unac_data683[];
|
||||
extern unsigned short unac_data684[];
|
||||
extern unsigned short unac_data685[];
|
||||
extern unsigned short unac_data686[];
|
||||
extern unsigned short unac_data687[];
|
||||
extern unsigned short unac_data688[];
|
||||
extern unsigned short unac_data689[];
|
||||
extern unsigned short unac_data690[];
|
||||
extern unsigned short unac_data691[];
|
||||
extern unsigned short unac_data692[];
|
||||
extern unsigned short unac_data693[];
|
||||
extern unsigned short unac_data694[];
|
||||
extern unsigned short unac_data695[];
|
||||
extern unsigned short unac_data696[];
|
||||
extern unsigned short unac_data697[];
|
||||
extern unsigned short unac_data698[];
|
||||
extern unsigned short unac_data699[];
|
||||
extern unsigned short unac_data700[];
|
||||
extern unsigned short unac_data701[];
|
||||
extern unsigned short unac_data702[];
|
||||
extern unsigned short unac_data703[];
|
||||
extern unsigned short unac_data704[];
|
||||
extern unsigned short unac_data705[];
|
||||
extern unsigned short unac_data706[];
|
||||
extern unsigned short unac_data707[];
|
||||
extern unsigned short unac_data708[];
|
||||
extern unsigned short unac_data709[];
|
||||
extern unsigned short unac_data710[];
|
||||
extern unsigned short unac_data711[];
|
||||
extern unsigned short unac_data712[];
|
||||
extern unsigned short unac_data713[];
|
||||
/* Generated by builder. Do not modify. End declarations */
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue