merged the case/diac sensitivity code back into trunk

This commit is contained in:
Jean-Francois Dockes 2012-09-25 19:20:24 +02:00
commit 94b571aac6
22 changed files with 743 additions and 271 deletions

View file

@ -21,7 +21,10 @@
#include <iostream> #include <iostream>
#include <string> #include <string>
#include <set> //#include <set>
#include <tr1/unordered_set>
using std::tr1::unordered_set;
#include <cstring> #include <cstring>
#include "textsplit.h" #include "textsplit.h"
@ -57,8 +60,8 @@ static int charclasses[charclasses_size];
// with interesting properties. This is far from full-blown management // with interesting properties. This is far from full-blown management
// of Unicode properties, but seems to do the job well enough in most // of Unicode properties, but seems to do the job well enough in most
// common cases // common cases
static set<unsigned int> unicign; static unordered_set<unsigned int> unicign;
static set<unsigned int> visiblewhite; static unordered_set<unsigned int> visiblewhite;
class CharClassInit { class CharClassInit {
public: public:

View file

@ -25,24 +25,8 @@
* This is used as a quick fix to the ascii-based code, and is not correct. * This is used as a quick fix to the ascii-based code, and is not correct.
* the correct way would be to do what http://www.unicode.org/reports/tr29/ * the correct way would be to do what http://www.unicode.org/reports/tr29/
* says. * says.
*
* Data from:
# PropList-4.0.1.txt
# Date: 2004-03-02, 02:42:40 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2004 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see UCD.html
*/ */
static const unsigned int uniign[] = { static const unsigned int uniign[] = {
0x0021, /* ; Terminal_Punctuation # Po EXCLAMATION MARK*/
0x002C, /* ; Terminal_Punctuation # Po COMMA*/
0x002D, /* ; Dash # Pd HYPHEN-MINUS*/
0x002E, /* ; Terminal_Punctuation # Po FULL STOP*/
0x003A, /* ; Terminal_Punctuation # Po [2] COLON..SEMICOLON*/
0x003B, /* ; Terminal_Punctuation # Po [2] COLON..SEMICOLON*/
0x003F, /* ; Terminal_Punctuation # Po QUESTION MARK*/
0x0085, /* NEXT LINE NEL;Cc */ 0x0085, /* NEXT LINE NEL;Cc */
0x00A0, /* NO-BREAK SPACE; Zs */ 0x00A0, /* NO-BREAK SPACE; Zs */
0x00A1, /* INVERTED EXCLAMATION MARK;Po */ 0x00A1, /* INVERTED EXCLAMATION MARK;Po */
@ -53,85 +37,81 @@ static const unsigned int uniign[] = {
0x00A6, /* BROKEN BAR;So */ 0x00A6, /* BROKEN BAR;So */
0x00A7, /* SECTION SIGN;So; */ 0x00A7, /* SECTION SIGN;So; */
0x00A9, /* COPYRIGHT SIGN;So */ 0x00A9, /* COPYRIGHT SIGN;So */
0x00AB, /* ; Quotation_Mark # Pi LEFT-POINTING DOUBLE ANGLE QUOTATION MARK*/ 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK*/
0x00AC, /* NOT SIGN;Sm */ 0x00AC, /* NOT SIGN;Sm */
0x00AD, /* ; Hyphen # Cf SOFT HYPHEN*/ 0x00AD, /* SOFT HYPHEN*/
0x00AE, /* registered sign */ 0x00AE, /* registered sign */
0x00B0, /* DEGREE SIGN;So;0;ET;;;;;N;;;;; */ 0x00B0, /* DEGREE SIGN */
0x00B1, /* PLUS-MINUS SIGN;Sm;0;ET;;;;;N;PLUS-OR-MINUS SIGN;;;;*/ 0x00B1, /* PLUS-MINUS SIGN */
0x00B7, /* MIDDLE DOT;Po;0;ON;;;;;N;;;;;*/ 0x00B7, /* MIDDLE DOT */
0x00BB, /* ; Quotation_Mark # Pf RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK*/ 0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
0x00BF, /* INVERTED QUESTION MARK;Po */ 0x00BF, /* INVERTED QUESTION MARK; */
0x00D7, /* MULTIPLICATION SIGN;Sm;0;ON;;;;;N;;;;; */ 0x00D7, /* MULTIPLICATION SIGN */
0x037E, /* ; Terminal_Punctuation # Po GREEK QUESTION MARK*/ 0x037E, /* GREEK QUESTION MARK */
0x0387, /* ; Terminal_Punctuation # Po GREEK ANO TELEIA*/ 0x0387, /* GREEK ANO TELEIA */
0x055C, /* ; STerm # Po ARMENIAN EXCLAMATION MARK*/ 0x055C, /* ARMENIAN EXCLAMATION MARK */
0x055E, /* ; STerm # Po ARMENIAN QUESTION MARK*/ 0x055E, /* ARMENIAN QUESTION MARK */
0x0589, /* ; STerm # Po ARMENIAN FULL STOP*/ 0x0589, /* ARMENIAN FULL STOP */
0x0589, /* ; Terminal_Punctuation # Po ARMENIAN FULL STOP*/ 0x058A, /* ARMENIAN HYPHEN */
0x058A, /* ; Dash # Pd ARMENIAN HYPHEN*/ 0x05C3, /* HEBREW PUNCTUATION SOF PASUQ */
0x058A, /* ; Hyphen # Pd ARMENIAN HYPHEN*/ 0x060C, /* ARABIC COMMA */
0x05C3, /* ; Terminal_Punctuation # Po HEBREW PUNCTUATION SOF PASUQ*/ 0x061B, /* ARABIC SEMICOLON */
0x060C, /* ; Terminal_Punctuation # Po ARABIC COMMA*/ 0x061F, /* ARABIC QUESTION MARK */
0x061B, /* ; Terminal_Punctuation # Po ARABIC SEMICOLON*/ 0x06D4, /* ARABIC FULL STOP */
0x061F, /* ; STerm # Po ARABIC QUESTION MARK*/ 0x166E, /* CANADIAN SYLLABICS FULL STOP */
0x061F, /* ; Terminal_Punctuation # Po ARABIC QUESTION MARK*/ 0x1680, /* OGHAM SPACE MARK */
0x06D4, /* ; STerm # Po ARABIC FULL STOP*/ 0x16EB, /* RUNIC SINGLE PUNCTUATION */
0x06D4, /* ; Terminal_Punctuation # Po ARABIC FULL STOP*/ 0x16EC, /* RUNIC MULTIPLE PUNCTUATION */
0x166E, /* ; STerm # Po CANADIAN SYLLABICS FULL STOP*/ 0x16ED, /* RUNIC CROSS PUNCTUATION */
0x1680, /* ; White_Space # Zs OGHAM SPACE MARK*/ 0x1803, /* MONGOLIAN FULL STOP */
0x16EB, /* RUNIC SINGLE PUNCTUATION;Po;0;L;;;;;N;;;;;*/ 0x1806, /* MONGOLIAN TODO SOFT HYPHEN */
0x16EC, /* RUNIC MULTIPLE PUNCTUATION;Po;0;L;;;;;N;;;;;*/ 0x1809, /* MONGOLIAN MANCHU FULL STOP */
0x16ED, /* RUNIC CROSS PUNCTUATION;Po;0;L;;;;;N;;;;; */ 0x180E, /* MONGOLIAN VOWEL SEPARATOR */
0x1803, /* ; STerm # Po MONGOLIAN FULL STOP*/ 0x2000, /* EN QUAD..HAIR SPACE*/
0x1806, /* ; Hyphen # Pd MONGOLIAN TODO SOFT HYPHEN*/ 0x2001, /* EN QUAD..HAIR SPACE*/
0x1809, /* ; STerm # Po MONGOLIAN MANCHU FULL STOP*/ 0x2002, /* EN QUAD..HAIR SPACE*/
0x180E, /* ; White_Space # Zs MONGOLIAN VOWEL SEPARATOR*/ 0x2003, /* EN QUAD..HAIR SPACE*/
0x2000, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ 0x2004, /* EN QUAD..HAIR SPACE*/
0x2001, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ 0x2005, /* EN QUAD..HAIR SPACE*/
0x2002, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ 0x2006, /* EN QUAD..HAIR SPACE*/
0x2003, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ 0x2007, /* EN QUAD..HAIR SPACE*/
0x2004, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ 0x2008, /* EN QUAD..HAIR SPACE*/
0x2005, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ 0x2009, /* EN QUAD..HAIR SPACE*/
0x2006, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ 0x200A, /* EN QUAD..HAIR SPACE*/
0x2007, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ 0x2010, /* [2] HYPHEN..NON-BREAKING HYPHEN*/
0x2008, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ 0x2011, /* [2] HYPHEN..NON-BREAKING HYPHEN*/
0x2009, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ 0x2012, /* [6] HYPHEN..HORIZONTAL BAR*/
0x200A, /* ; White_Space # Zs [11] EN QUAD..HAIR SPACE*/ 0x2013, /* [6] HYPHEN..HORIZONTAL BAR*/
0x2010, /* ; Hyphen # Pd [2] HYPHEN..NON-BREAKING HYPHEN*/ 0x2014, /* [6] HYPHEN..HORIZONTAL BAR*/
0x2011, /* ; Hyphen # Pd [2] HYPHEN..NON-BREAKING HYPHEN*/ 0x2015, /* [6] HYPHEN..HORIZONTAL BAR*/
0x2012, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/ 0x2018, /* LEFT SINGLE QUOTATION MARK*/
0x2013, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/ 0x2019, /* RIGHT SINGLE QUOTATION MARK*/
0x2014, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/ 0x201A, /* SINGLE LOW-9 QUOTATION MARK*/
0x2015, /* ; Dash # Pd [6] HYPHEN..HORIZONTAL BAR*/ 0x201B, /* SINGLE HIGH-REVERSED-9 QUOTATION MARK*/
0x2018, /* ; Quotation_Mark # Pi LEFT SINGLE QUOTATION MARK*/ 0x201C, /* LEFT DOUBLE QUOTATION MARK*/
0x2019, /* ; Quotation_Mark # Pf RIGHT SINGLE QUOTATION MARK*/ 0x201D, /* RIGHT DOUBLE QUOTATION MARK*/
0x201A, /* ; Quotation_Mark # Ps SINGLE LOW-9 QUOTATION MARK*/ 0x201E, /* DOUBLE LOW-9 QUOTATION MARK*/
0x201B, /* ; Quotation_Mark # Pi SINGLE HIGH-REVERSED-9 QUOTATION MARK*/ 0x201F, /* DOUBLE HIGH-REVERSED-9 QUOTATION MARK*/
0x201C, /* ; Quotation_Mark # Pi LEFT DOUBLE QUOTATION MARK*/ 0x2022, /* BULLET */
0x201D, /* ; Quotation_Mark # Pf RIGHT DOUBLE QUOTATION MARK*/ 0x2023, /* TRIANGULAR BULLET*/
0x201E, /* ; Quotation_Mark # Ps DOUBLE LOW-9 QUOTATION MARK*/
0x201F, /* ; Quotation_Mark # Pi DOUBLE HIGH-REVERSED-9 QUOTATION MARK*/
0x2022, /* BULLET;Po;0;ON;;;;;N;;;;; */
0x2023, /* TRIANGULAR BULLET;Po;0;ON;;;;;N;;;;;*/
0x2024, /* ONE DOT LEADER;Po;0;ON;<compat> 002E;;;;N;;;;;*/ 0x2024, /* ONE DOT LEADER;Po;0;ON;<compat> 002E;;;;N;;;;;*/
0x2025, /* TWO DOT LEADER;Po;0;ON;<compat> 002E 002E;;;;N;;;;; */ 0x2025, /* TWO DOT LEADER;Po;0;ON;<compat> 002E 002E;;;;N;;;;; */
0x2026, /* HORIZONTAL ELLIPSIS;Po;0;ON;<compat> 002E 002E 002E;;;;N;;;;; */ 0x2026, /* HORIZONTAL ELLIPSIS;Po;0;ON;<compat> 002E 002E 002E;;;;N;;;;; */
0x2028, /* ; White_Space # Zl LINE SEPARATOR*/ 0x2028, /* LINE SEPARATOR */
0x2029, /* ; White_Space # Zp PARAGRAPH SEPARATOR*/ 0x2029, /* PARAGRAPH SEPARATOR */
0x202F, /* ; White_Space # Zs NARROW NO-BREAK SPACE*/ 0x202F, /* NARROW NO-BREAK SPACE */
0x2032, /* PRIME;Po;0;ET;;;;;N;;;;;*/ 0x2032, /* PRIME */
0x2039, /* ; Quotation_Mark # Pi SINGLE LEFT-POINTING ANGLE QUOTATION MARK*/ 0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
0x203A, /* ; Quotation_Mark # Pf SINGLE RIGHT-POINTING ANGLE QUOTATION MARK*/ 0x203A, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK*/
0x203C, /* ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/ 0x203C, /* [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/
0x203D, /* ; STerm # Po [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/ 0x203D, /* [2] DOUBLE EXCLAMATION MARK..INTERROBANG*/
0x2047, /* ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/ 0x2047, /* [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
0x2048, /* ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/ 0x2048, /* [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
0x2049, /* ; Terminal_Punctuation # Po [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/ 0x2049, /* [3] DOUBLE QUESTION MARK..EXCLAMATION QUESTION MARK*/
0x2053, /* ; Dash # Po SWUNG DASH*/ 0x2053, /* SWUNG DASH*/
0x205F, /* ; White_Space # Zs MEDIUM MATHEMATICAL SPACE*/ 0x205F, /* MEDIUM MATHEMATICAL SPACE*/
0x207B, /* ; Dash # Sm SUPERSCRIPT MINUS*/ 0x207B, /* SUPERSCRIPT MINUS*/
0x208B, /* ; Dash # Sm SUBSCRIPT MINUS*/ 0x208B, /* SUBSCRIPT MINUS*/
0x20A0, /* EURO-CURRENCY SIGN */ 0x20A0, /* EURO-CURRENCY SIGN */
0x20A1, /* COLON SIGN */ 0x20A1, /* COLON SIGN */
0x20A2, /* CRUZEIRO SIGN */ 0x20A2, /* CRUZEIRO SIGN */
@ -161,60 +141,156 @@ static const unsigned int uniign[] = {
0x2117, /* SOUND RECORDING COPYRIGHT;So */ 0x2117, /* SOUND RECORDING COPYRIGHT;So */
0x2122, /* TRADE MARK SIGN;So; */ 0x2122, /* TRADE MARK SIGN;So; */
0x2192, /* RIGHTWARDS ARROW;Sm;0;ON;;;;;N;RIGHT ARROW;;;;*/ 0x2192, /* RIGHTWARDS ARROW;Sm;0;ON;;;;;N;RIGHT ARROW;;;;*/
0x2212, /* ; Dash # Sm MINUS SIGN*/ 0x2212, /* MINUS SIGN*/
0x25A0, /* BLACK SQUARE */
0x25A1, /* WHITE SQUARE */
0x25A2, /* WHITE SQUARE WITH ROUNDED CORNERS */
0x25A3, /* WHITE SQUARE CONTAINING BLACK SMALL SQUARE */
0x25A4, /* SQUARE WITH HORIZONTAL FILL */
0x25A5, /* SQUARE WITH VERTICAL FILL */
0x25A6, /* SQUARE WITH ORTHOGONAL CROSSHATCH FILL */
0x25A7, /* SQUARE WITH UPPER LEFT TO LOWER RIGHT FILL */
0x25A8, /* SQUARE WITH UPPER RIGHT TO LOWER LEFT FILL */
0x25A9, /* SQUARE WITH DIAGONAL CROSSHATCH FILL */
0x25AA, /* BLACK SMALL SQUARE */
0x25AB, /* WHITE SMALL SQUARE */
0x25AC, /* BLACK RECTANGLE */
0x25AD, /* WHITE RECTANGLE */
0x25AE, /* BLACK VERTICAL RECTANGLE */
0x25AF, /* WHITE VERTICAL RECTANGLE */
0x25B0, /* BLACK PARALLELOGRAM */
0x25B1, /* WHITE PARALLELOGRAM */
0x25B2, /* BLACK UP-POINTING TRIANGLE */
0x25B3, /* WHITE UP-POINTING TRIANGLE */
0x25B4, /* BLACK UP-POINTING SMALL TRIANGLE */
0x25B5, /* WHITE UP-POINTING SMALL TRIANGLE */
0x25B6, /* BLACK RIGHT-POINTING TRIANGLE */
0x25B7, /* WHITE RIGHT-POINTING TRIANGLE */
0x25B8, /* BLACK RIGHT-POINTING SMALL TRIANGLE */
0x25B9, /* WHITE RIGHT-POINTING SMALL TRIANGLE */
0x25BA, /* BLACK RIGHT-POINTING POINTER */
0x25BB, /* WHITE RIGHT-POINTING POINTER */
0x25BC, /* BLACK DOWN-POINTING TRIANGLE */
0x25BD, /* WHITE DOWN-POINTING TRIANGLE */
0x25BE, /* BLACK DOWN-POINTING SMALL TRIANGLE */
0x25BF, /* WHITE DOWN-POINTING SMALL TRIANGLE */
0x25C0, /* BLACK LEFT-POINTING TRIANGLE */
0x25C1, /* WHITE LEFT-POINTING TRIANGLE */
0x25C2, /* BLACK LEFT-POINTING SMALL TRIANGLE */
0x25C3, /* WHITE LEFT-POINTING SMALL TRIANGLE */
0x25C4, /* BLACK LEFT-POINTING POINTER */
0x25C5, /* WHITE LEFT-POINTING POINTER */
0x25C6, /* BLACK DIAMOND */
0x25C7, /* WHITE DIAMOND */
0x25C8, /* WHITE DIAMOND CONTAINING BLACK SMALL DIAMOND */
0x25C9, /* FISHEYE */
0x25CA, /* LOZENGE */
0x25CB, /* WHITE CIRCLE */
0x25CC, /* DOTTED CIRCLE */
0x25CD, /* CIRCLE WITH VERTICAL FILL */
0x25CE, /* BULLSEYE */
0x25CF, /* BLACK CIRCLE */
0x25D0, /* CIRCLE WITH LEFT HALF BLACK */
0x25D1, /* CIRCLE WITH RIGHT HALF BLACK */
0x25D2, /* CIRCLE WITH LOWER HALF BLACK */
0x25D3, /* CIRCLE WITH UPPER HALF BLACK */
0x25D4, /* CIRCLE WITH UPPER RIGHT QUADRANT BLACK */
0x25D5, /* CIRCLE WITH ALL BUT UPPER LEFT QUADRANT BLACK */
0x25D6, /* LEFT HALF BLACK CIRCLE */
0x25D7, /* RIGHT HALF BLACK CIRCLE */
0x25D8, /* INVERSE BULLET */
0x25D9, /* INVERSE WHITE CIRCLE */
0x25DA, /* UPPER HALF INVERSE WHITE CIRCLE */
0x25DB, /* LOWER HALF INVERSE WHITE CIRCLE */
0x25DC, /* UPPER LEFT QUADRANT CIRCULAR ARC */
0x25DD, /* UPPER RIGHT QUADRANT CIRCULAR ARC */
0x25DE, /* LOWER RIGHT QUADRANT CIRCULAR ARC */
0x25DF, /* LOWER LEFT QUADRANT CIRCULAR ARC */
0x25E0, /* UPPER HALF CIRCLE */
0x25E1, /* LOWER HALF CIRCLE */
0x25E2, /* BLACK LOWER RIGHT TRIANGLE */
0x25E3, /* BLACK LOWER LEFT TRIANGLE */
0x25E4, /* BLACK UPPER LEFT TRIANGLE */
0x25E5, /* BLACK UPPER RIGHT TRIANGLE */
0x25E6, /* WHITE BULLET */
0x25E7, /* SQUARE WITH LEFT HALF BLACK */
0x25E8, /* SQUARE WITH RIGHT HALF BLACK */
0x25E9, /* SQUARE WITH UPPER LEFT DIAGONAL HALF BLACK */
0x25EA, /* SQUARE WITH LOWER RIGHT DIAGONAL HALF BLACK */
0x25EB, /* WHITE SQUARE WITH VERTICAL BISECTING LINE */
0x25EC, /* WHITE UP-POINTING TRIANGLE WITH DOT */
0x25ED, /* UP-POINTING TRIANGLE WITH LEFT HALF BLACK */
0x25EE, /* UP-POINTING TRIANGLE WITH RIGHT HALF BLACK */
0x25EF, /* LARGE CIRCLE */
0x25F0, /* WHITE SQUARE WITH UPPER LEFT QUADRANT */
0x25F1, /* WHITE SQUARE WITH LOWER LEFT QUADRANT */
0x25F2, /* WHITE SQUARE WITH LOWER RIGHT QUADRANT */
0x25F3, /* WHITE SQUARE WITH UPPER RIGHT QUADRANT */
0x25F4, /* WHITE CIRCLE WITH UPPER LEFT QUADRANT */
0x25F5, /* WHITE CIRCLE WITH LOWER LEFT QUADRANT */
0x25F6, /* WHITE CIRCLE WITH LOWER RIGHT QUADRANT */
0x25F7, /* WHITE CIRCLE WITH UPPER RIGHT QUADRANT */
0x25F8, /* UPPER LEFT TRIANGLE */
0x25F9, /* UPPER RIGHT TRIANGLE */
0x25FA, /* LOWER LEFT TRIANGLE */
0x25FB, /* WHITE MEDIUM SQUARE */
0x25FC, /* BLACK MEDIUM SQUARE */
0x25FD, /* WHITE MEDIUM SMALL SQUARE */
0x25FE, /* BLACK MEDIUM SMALL SQUARE */
0x25FF, /* LOWER RIGHT TRIANGLE */
0x2E2E, /* REVERSED QUESTION MARK;Po;0;ON;;;;;N;;;;; */ 0x2E2E, /* REVERSED QUESTION MARK;Po;0;ON;;;;;N;;;;; */
0x3000, /* ; White_Space # Zs IDEOGRAPHIC SPACE*/ 0x3000, /* IDEOGRAPHIC SPACE*/
0x3002, /* ; STerm # Po IDEOGRAPHIC FULL STOP*/ 0x3002, /* IDEOGRAPHIC FULL STOP*/
0x300C, /* ; Quotation_Mark # Ps LEFT CORNER BRACKET*/ 0x300C, /* LEFT CORNER BRACKET*/
0x300D, /* ; Quotation_Mark # Pe RIGHT CORNER BRACKET*/ 0x300D, /* RIGHT CORNER BRACKET*/
0x300E, /* ; Quotation_Mark # Ps LEFT WHITE CORNER BRACKET*/ 0x300E, /* LEFT WHITE CORNER BRACKET*/
0x300F, /* ; Quotation_Mark # Pe RIGHT WHITE CORNER BRACKET*/ 0x300F, /* RIGHT WHITE CORNER BRACKET*/
0x301C, /* ; Dash # Pd WAVE DASH*/ 0x301C, /* WAVE DASH*/
0x301D, /* ; Quotation_Mark # Ps REVERSED DOUBLE PRIME QUOTATION MARK*/ 0x301D, /* REVERSED DOUBLE PRIME QUOTATION MARK*/
0x301E, /* ; Quotation_Mark # Pe LOW DOUBLE PRIME QUOTATION MARK*/ 0x301E, /* LOW DOUBLE PRIME QUOTATION MARK*/
0x3030, /* ; Dash # Pd WAVY DASH*/ 0x3030, /* WAVY DASH*/
0x30FB, /* ; Hyphen # Pc KATAKANA MIDDLE DOT*/ 0x30FB, /* KATAKANA MIDDLE DOT*/
0xC2B6, /* PILCROW SIGN;So;0;ON;;;;;N;PARAGRAPH SIGN;;;; */ 0xC2B6, /* PILCROW SIGN;So;0;ON;;;;;N;PARAGRAPH SIGN;;;; */
0xC3B7, /* DIVISION SIGN;Sm;0;ON;;;;;N;;;;; */ 0xC3B7, /* DIVISION SIGN;Sm;0;ON;;;;;N;;;;; */
0xFE31, /* ; Dash # Pd PRESENTATION FORM FOR VERTICAL EM DASH*/ 0xFE31, /* PRESENTATION FORM FOR VERTICAL EM DASH*/
0xFE32, /* ; Dash # Pd PRESENTATION FORM FOR VERTICAL EN DASH*/ 0xFE32, /* PRESENTATION FORM FOR VERTICAL EN DASH*/
0xFE41, /* ; Quotation_Mark # Ps PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET*/ 0xFE41, /* PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET*/
0xFE42, /* ; Quotation_Mark # Pe PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET*/ 0xFE42, /* PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET*/
0xFE43, /* ; Quotation_Mark # Ps PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET*/ 0xFE43, /* PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET*/
0xFE44, /* ; Quotation_Mark # Pe PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET*/ 0xFE44, /* PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET*/
0xFE50, /* ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP*/ 0xFE50, /* [3] SMALL COMMA..SMALL FULL STOP*/
0xFE51, /* ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP*/ 0xFE51, /* [3] SMALL COMMA..SMALL FULL STOP*/
0xFE52, /* ; STerm # Po SMALL FULL STOP*/ 0xFE52, /* STOP*/
0xFE52, /* ; Terminal_Punctuation # Po [3] SMALL COMMA..SMALL FULL STOP*/ 0xFE52, /* [3] SMALL COMMA..SMALL FULL STOP*/
0xFE54, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/ 0xFE54, /* [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
0xFE55, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/ 0xFE55, /* [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
0xFE56, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/ 0xFE56, /* [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
0xFE57, /* ; Terminal_Punctuation # Po [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/ 0xFE57, /* [4] SMALL SEMICOLON..SMALL EXCLAMATION MARK*/
0xFE58, /* ; Dash # Pd SMALL EM DASH*/ 0xFE58, /* SMALL EM DASH */
0xFE63, /* ; Hyphen # Pd SMALL HYPHEN-MINUS*/ 0xFE63, /* SMALL HYPHEN-MINUS */
0xFF01, /* FULLWIDTH EXCLAMATION MARK;Po;0;ON;<wide> 0021;;;;N;;;;; */ 0xFF01, /* FULLWIDTH EXCLAMATION MARK */
0xFF02, /* FULLWIDTH QUOTATION MARK;Po;0;ON;<wide> 0022;;;;N;;;;; */ 0xFF02, /* FULLWIDTH QUOTATION MARK */
0xFF03, /* FULLWIDTH NUMBER SIGN;Po;0;ET;<wide> 0023;;;;N;;;;; */ 0xFF03, /* FULLWIDTH NUMBER SIGN */
0xFF04, /* FULLWIDTH DOLLAR SIGN;Sc;0;ET;<wide> 0024;;;;N;;;;; */ 0xFF04, /* FULLWIDTH DOLLAR SIGN */
0xFF05, /* FULLWIDTH PERCENT SIGN;Po;0;ET;<wide> 0025;;;;N;;;;; */ 0xFF05, /* FULLWIDTH PERCENT SIGN */
0xFF06, /* FULLWIDTH AMPERSAND;Po;0;ON;<wide> 0026;;;;N;;;;; */ 0xFF06, /* FULLWIDTH AMPERSAND */
0xFF07, /* FULLWIDTH APOSTROPHE;Po;0;ON;<wide> 0027;;;;N;;;;; */ 0xFF07, /* FULLWIDTH APOSTROPHE */
0xFF08, /* FULLWIDTH LEFT PARENTHESIS;Ps;0;ON;<wide> 0028;;;;Y;FULLWIDTH OPENIN*/ 0xFF08, /* FULLWIDTH LEFT PARENTHESIS */
0xFF09, /* FULLWIDTH RIGHT PARENTHESIS;Pe;0;ON;<wide> 0029;;;;Y;FULLWIDTH CLOS*/ 0xFF09, /* FULLWIDTH RIGHT PARENTHESIS */
0xFF0A, /* FULLWIDTH ASTERISK;Po;0;ON;<wide> 002A;;;;N;;;;; */ 0xFF0A, /* FULLWIDTH ASTERISK */
0xFF0B, /* FULLWIDTH PLUS SIGN;Sm;0;ES;<wide> 002B;;;;N;;;;; */ 0xFF0B, /* FULLWIDTH PLUS SIGN */
0xFF0C, /* FULLWIDTH COMMA;Po;0;CS;<wide> 002C;;;;N;;;;; */ 0xFF0C, /* FULLWIDTH COMMA */
0xFF0D, /* FULLWIDTH HYPHEN-MINUS;Pd;0;ES;<wide> 002D;;;;N;;;;; */ 0xFF0D, /* FULLWIDTH HYPHEN-MINUS */
0xFF0E, /* FULLWIDTH FULL STOP;Po;0;CS;<wide> 002E;;;;N;FULLWIDTH PERIOD;;;; */ 0xFF0E, /* FULLWIDTH FULL STOP */
0xFF0F, /* FULLWIDTH SOLIDUS;Po;0;CS;<wide> 002F;;;;N;FULLWIDTH SLASH;;;; */ 0xFF0F, /* FULLWIDTH SOLIDUS */
0xFF1A, /* ; Terminal_Punctuation # Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON*/ 0xFF1A, /* [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON*/
0xFF1B, /* ; Terminal_Punctuation # Po [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON*/ 0xFF1B, /* [2] FULLWIDTH COLON..FULLWIDTH SEMICOLON*/
0xFF1F, /* ; Terminal_Punctuation # Po FULLWIDTH QUESTION MARK*/ 0xFF1F, /* FULLWIDTH QUESTION MARK*/
0xFF61, /* ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC FULL STOP*/ 0xFF61, /* HALFWIDTH IDEOGRAPHIC FULL STOP*/
0xFF62, /* ; Quotation_Mark # Ps HALFWIDTH LEFT CORNER BRACKET*/ 0xFF62, /* HALFWIDTH LEFT CORNER BRACKET*/
0xFF63, /* ; Quotation_Mark # Pe HALFWIDTH RIGHT CORNER BRACKET*/ 0xFF63, /* HALFWIDTH RIGHT CORNER BRACKET*/
0xFF64, /* ; Terminal_Punctuation # Po HALFWIDTH IDEOGRAPHIC COMMA*/ 0xFF64, /* HALFWIDTH IDEOGRAPHIC COMMA*/
0xFF65, /* ; Hyphen # Pc HALFWIDTH KATAKANA MIDDLE DOT*/ 0xFF65, /* HALFWIDTH KATAKANA MIDDLE DOT*/
}; };
/* Things that would visibly break a block of text, rendering obvious the need /* Things that would visibly break a block of text, rendering obvious the need

View file

@ -25,6 +25,7 @@
#include <stdio.h> #include <stdio.h>
#include <QComboBox>
#include <qvariant.h> #include <qvariant.h>
#include <qwidget.h> #include <qwidget.h>

View file

@ -301,6 +301,7 @@ void RclMain::init()
connect(restable, SIGNAL(docSaveToFileClicked(Rcl::Doc)), connect(restable, SIGNAL(docSaveToFileClicked(Rcl::Doc)),
this, SLOT(saveDocToFile(Rcl::Doc))); this, SLOT(saveDocToFile(Rcl::Doc)));
reslist->setRclMain(this);
connect(this, SIGNAL(docSourceChanged(RefCntr<DocSequence>)), connect(this, SIGNAL(docSourceChanged(RefCntr<DocSequence>)),
reslist, SLOT(setDocSource(RefCntr<DocSequence>))); reslist, SLOT(setDocSource(RefCntr<DocSequence>)));
connect(firstPageAction, SIGNAL(activated()), connect(firstPageAction, SIGNAL(activated()),
@ -931,8 +932,12 @@ void RclMain::showIndexSched(bool modal)
connect(indexSched->cronCLB, SIGNAL(clicked()), connect(indexSched->cronCLB, SIGNAL(clicked()),
this, SLOT(execCronTool())); this, SLOT(execCronTool()));
if (theconfig && theconfig->isDefaultConfig()) { if (theconfig && theconfig->isDefaultConfig()) {
#ifdef RCL_MONITOR
connect(indexSched->rtidxCLB, SIGNAL(clicked()), connect(indexSched->rtidxCLB, SIGNAL(clicked()),
this, SLOT(execRTITool())); this, SLOT(execRTITool()));
#else
indexSched->rtidxCLB->setEnabled(false);
#endif
} else { } else {
indexSched->rtidxCLB->setEnabled(false); indexSched->rtidxCLB->setEnabled(false);
} }
@ -1493,8 +1498,9 @@ static bool lookForHtmlBrowser(string &exefile)
return false; return false;
} }
void RclMain::startNativeViewer(Rcl::Doc doc) void RclMain::startNativeViewer(Rcl::Doc doc, int pagenum)
{ {
LOGDEB(("RclMain::startNativeViewer: page %d\n", pagenum));
// Look for appropriate viewer // Look for appropriate viewer
string cmdplusattr; string cmdplusattr;
if (prefs.useDesktopOpen) { if (prefs.useDesktopOpen) {
@ -1512,11 +1518,13 @@ void RclMain::startNativeViewer(Rcl::Doc doc)
return; return;
} }
int pagenum = 1; if (pagenum == -1) {
if (m_source.isNotNull())
pagenum = m_source->getFirstMatchPage(doc);
if (pagenum == -1)
pagenum = 1; pagenum = 1;
if (m_source.isNotNull())
pagenum = m_source->getFirstMatchPage(doc);
if (pagenum == -1)
pagenum = 1;
}
char cpagenum[20]; char cpagenum[20];
sprintf(cpagenum, "%d", pagenum); sprintf(cpagenum, "%d", pagenum);

View file

@ -119,7 +119,7 @@ public slots:
virtual void docExpand(Rcl::Doc); virtual void docExpand(Rcl::Doc);
virtual void startPreview(int docnum, Rcl::Doc doc, int keymods); virtual void startPreview(int docnum, Rcl::Doc doc, int keymods);
virtual void startPreview(Rcl::Doc); virtual void startPreview(Rcl::Doc);
virtual void startNativeViewer(Rcl::Doc); virtual void startNativeViewer(Rcl::Doc, int pagenum = -1);
virtual void saveDocToFile(Rcl::Doc); virtual void saveDocToFile(Rcl::Doc);
virtual void previewNextInTab(Preview *, int sid, int docnum); virtual void previewNextInTab(Preview *, int sid, int docnum);
virtual void previewPrevInTab(Preview *, int sid, int docnum); virtual void previewPrevInTab(Preview *, int sid, int docnum);

View file

@ -25,6 +25,7 @@ HEADERS += \
restable.h \ restable.h \
rtitool.h \ rtitool.h \
searchclause_w.h \ searchclause_w.h \
snippets_w.h \
spell_w.h \ spell_w.h \
ssearch_w.h \ ssearch_w.h \
uiprefs_w.h \ uiprefs_w.h \
@ -46,6 +47,7 @@ SOURCES += \
restable.cpp \ restable.cpp \
rtitool.cpp \ rtitool.cpp \
searchclause_w.cpp \ searchclause_w.cpp \
snippets_w.cpp \
spell_w.cpp \ spell_w.cpp \
ssearch_w.cpp \ ssearch_w.cpp \
uiprefs_w.cpp \ uiprefs_w.cpp \
@ -64,6 +66,7 @@ FORMS = \
restable.ui \ restable.ui \
rtitool.ui \ rtitool.ui \
spell.ui \ spell.ui \
snippets.ui \
ssearchb.ui \ ssearchb.ui \
uiprefs.ui \ uiprefs.ui \
viewaction.ui \ viewaction.ui \

View file

@ -50,6 +50,7 @@
#include "refcntr.h" #include "refcntr.h"
#include "internfile.h" #include "internfile.h"
#include "indexer.h" #include "indexer.h"
#include "snippets_w.h"
#include "reslist.h" #include "reslist.h"
#include "moc_reslist.cpp" #include "moc_reslist.cpp"
@ -281,7 +282,7 @@ static PlainToRichQtReslist g_hiliter;
///////////////////////////////////// /////////////////////////////////////
ResList::ResList(QWidget* parent, const char* name) ResList::ResList(QWidget* parent, const char* name)
: RESLIST_PARENTCLASS(parent) : RESLIST_PARENTCLASS(parent), m_parent(0)
{ {
if (!name) if (!name)
setObjectName("resList"); setObjectName("resList");
@ -902,6 +903,9 @@ void ResList::createPopupMenu(const QPoint& pos)
this, SLOT(menuPreviewParent())); this, SLOT(menuPreviewParent()));
popup->addAction(tr("&Open Parent document/folder"), popup->addAction(tr("&Open Parent document/folder"),
this, SLOT(menuOpenParent())); this, SLOT(menuOpenParent()));
if (m_source->snippetsCapable())
popup->addAction(tr("Open &Snippets window"),
this, SLOT(menuOpenSnippets()));
popup->popup(mapToGlobal(pos)); popup->popup(mapToGlobal(pos));
} }
@ -953,6 +957,20 @@ void ResList::menuOpenParent()
} }
} }
void ResList::menuOpenSnippets()
{
Rcl::Doc doc;
if (!getDoc(m_popDoc, doc) || m_source.isNull())
return;
SnippetsW *sp = new SnippetsW(doc, m_source);
if (m_parent) {
connect(sp, SIGNAL(startNativeViewer(Rcl::Doc, int)),
m_parent, SLOT(startNativeViewer(Rcl::Doc, int)));
}
sp->show();
}
void ResList::menuEdit() void ResList::menuEdit()
{ {
Rcl::Doc doc; Rcl::Doc doc;

View file

@ -41,6 +41,7 @@ using std::pair;
#include "rcldoc.h" #include "rcldoc.h"
#include "reslistpager.h" #include "reslistpager.h"
class RclMain;
class QtGuiResListPager; class QtGuiResListPager;
/** /**
@ -66,7 +67,10 @@ class ResList : public RESLIST_PARENTCLASS
int listId() const {return m_listId;} int listId() const {return m_listId;}
int pageFirstDocNum(); int pageFirstDocNum();
void setFont(); void setFont();
/** Record the main window; it is stored in m_parent (may be null). */
void setRclMain(RclMain *m) { this->m_parent = m; }
public slots: public slots:
virtual void setDocSource(RefCntr<DocSequence> nsource); virtual void setDocSource(RefCntr<DocSequence> nsource);
virtual void resetList(); // Erase current list virtual void resetList(); // Erase current list
@ -84,6 +88,7 @@ class ResList : public RESLIST_PARENTCLASS
virtual void menuExpand(); virtual void menuExpand();
virtual void menuPreviewParent(); virtual void menuPreviewParent();
virtual void menuOpenParent(); virtual void menuOpenParent();
virtual void menuOpenSnippets();
virtual void previewExposed(int); virtual void previewExposed(int);
virtual void append(const QString &text); virtual void append(const QString &text);
virtual void readDocSource(); virtual void readDocSource();
@ -132,6 +137,7 @@ class ResList : public RESLIST_PARENTCLASS
// so we store the page and display it when done. // so we store the page and display it when done.
QString m_text; QString m_text;
#endif #endif
RclMain *m_parent;
virtual void displayPage(); // Display current page virtual void displayPage(); // Display current page
static int newListId(); static int newListId();

67
src/qtgui/snippets.ui Normal file
View file

@ -0,0 +1,67 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Qt Designer form for the snippets window.
  The top-level widget is a QDialog named "Snippets" (640x400, size grip
  enabled) holding a vertical layout with:
    - webView:   a QWebView, initially showing about:blank
    - buttonBox: a horizontal QDialogButtonBox with a single Close button
  Any click on a buttonBox button is connected to the close() slot of the
  top-level widget.
-->
<ui version="4.0">
<class>Snippets</class>
<widget class="QDialog" name="Snippets">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>640</width>
<height>400</height>
</rect>
</property>
<property name="windowTitle">
<string>Snippets</string>
</property>
<property name="sizeGripEnabled">
<bool>true</bool>
</property>
<layout class="QVBoxLayout" name="verticalLayout">
<item>
<widget class="QWebView" name="webView">
<property name="url">
<url>
<string>about:blank</string>
</url>
</property>
</widget>
</item>
<item>
<widget class="QDialogButtonBox" name="buttonBox">
<property name="orientation">
<enum>Qt::Horizontal</enum>
</property>
<property name="standardButtons">
<set>QDialogButtonBox::Close</set>
</property>
</widget>
</item>
</layout>
</widget>
<customwidgets>
<customwidget>
<class>QWebView</class>
<extends>QWidget</extends>
<header>QtWebKit/QWebView</header>
</customwidget>
</customwidgets>
<resources/>
<connections>
<connection>
<sender>buttonBox</sender>
<signal>clicked(QAbstractButton*)</signal>
<receiver>Snippets</receiver>
<slot>close()</slot>
<hints>
<hint type="sourcelabel">
<x>257</x>
<y>369</y>
</hint>
<hint type="destinationlabel">
<x>257</x>
<y>197</y>
</hint>
</hints>
</connection>
</connections>
</ui>

124
src/qtgui/snippets_w.cpp Normal file
View file

@ -0,0 +1,124 @@
/* Copyright (C) 2012 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "autoconfig.h"
#include <unistd.h>
#include <stdio.h>
#include <string>
#include <vector>
using namespace std;
#include "debuglog.h"
#include "recoll.h"
#include "snippets_w.h"
#include "guiutils.h"
#include "rcldb.h"
#include "rclhelp.h"
#include "plaintorich.h"
/**
 * Highlighter used when rendering snippets: matched terms are wrapped in
 * a <span> of class 'rclmatch', coloured with the query term colour taken
 * from the preferences (prefs.qtermcolor).
 */
class PlainToRichQtSnippets : public PlainToRich {
public:
    /** Opening markup for a match; the group index argument is not used. */
    virtual string startMatch(unsigned int)
    {
        string openTag("<span class='rclmatch' style='color: ");
        openTag += prefs.qtermcolor.toAscii().constData();
        openTag += "'>";
        return openTag;
    }

    /** Closing markup for a match. */
    virtual string endMatch()
    {
        return "</span>";
    }
};

// Single highlighter instance, private to this file.
static PlainToRichQtSnippets g_hiliter;
/**
 * Build the window contents for m_doc.
 *
 * Sets the window title from the document title (falling back to the file
 * name), fetches the (page number, text) pairs from the document source,
 * and renders them as a two-column HTML table in the web view: a "P. N"
 * link when the page number is positive, then the text with the query
 * terms highlighted. All links are delegated to linkWasClicked().
 *
 * Does nothing if there is no document source.
 */
void SnippetsW::init()
{
    if (m_source.isNull())
        return;

    // Make title out of file name if none yet
    string titleOrFilename;
    string utf8fn;
    m_doc.getmeta(Rcl::Doc::keytt, &titleOrFilename);
    m_doc.getmeta(Rcl::Doc::keyfn, &utf8fn);
    if (titleOrFilename.empty()) {
        titleOrFilename = utf8fn;
    }
    setWindowTitle(QString::fromUtf8(titleOrFilename.c_str()));

    vector<pair<int, string> > vpabs;
    m_source->getAbstract(m_doc, vpabs);

    HighlightData hdata;
    m_source->getTerms(hdata);

    QString html = QString::fromAscii(
        "<html><head>"
        "<meta http-equiv=\"content-type\" "
        "content=\"text/html; charset=utf-8\"></head>"
        "<body style='overflow-x: scroll; white-space: nowrap'>"
        "<table>"
        );

    g_hiliter.set_inputhtml(false);

    for (vector<pair<int, string> >::const_iterator it = vpabs.begin();
         it != vpabs.end(); ++it) {
        html += "<tr><td>";
        if (it->first > 0) {
            // The same text is used as link target and as link label;
            // linkWasClicked() extracts the page number from the target.
            char buf[100];
            snprintf(buf, sizeof(buf), "P.&nbsp;%d", it->first);
            html += "<a href=\"";
            html += buf;
            html += "\">";
            html += buf;
            html += "</a>";
        }
        html += "</td><td>";
        list<string> lr;
        g_hiliter.plaintorich(it->second, lr, hdata);
        // Only the first output chunk is displayed. Guard against an empty
        // output list: calling front() on it would be undefined behaviour.
        if (!lr.empty())
            html.append(QString::fromUtf8(lr.front().c_str()));
        html.append("</td></tr>\n");
    }
    // Close the table opened in the page header before closing the body.
    html.append("</table></body></html>");

    webView->setHtml(html);
    connect(webView, SIGNAL(linkClicked(const QUrl &)),
            this, SLOT(linkWasClicked(const QUrl &)));
    webView->page()->setLinkDelegationPolicy(QWebPage::DelegateAllLinks);
}
void SnippetsW::linkWasClicked(const QUrl &url)
{
string ascurl = (const char *)url.toString().toAscii();;
LOGDEB(("Snippets::linkWasClicked: [%s]\n", ascurl.c_str()));
if (ascurl.size() > 3) {
int what = ascurl[0];
switch (what) {
case 'P':
{
int page = atoi(ascurl.c_str()+2);
emit startNativeViewer(m_doc, page);
return;
}
}
}
LOGERR(("Snippets::linkWasClicked: bad link [%s]\n", ascurl.c_str()));
}

50
src/qtgui/snippets_w.h Normal file
View file

@ -0,0 +1,50 @@
/* Copyright (C) 2012 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _SNIPPETS_W_H_INCLUDED_
#define _SNIPPETS_W_H_INCLUDED_
#include "rcldoc.h"
#include "refcntr.h"
#include "docseq.h"
#include "rclmain_w.h"
#include "ui_snippets.h"
/**
 * Window showing the snippets (text extracts around the matched terms) for
 * one result document. The page content is built by init(); clicking a page
 * link makes the widget emit startNativeViewer().
 */
class SnippetsW : public QWidget, public Ui::Snippets
{
Q_OBJECT
public:
// doc: the document to show snippets for (stored by copy).
// source: the document sequence the snippets and terms are fetched from.
// parent: optional parent widget.
SnippetsW(Rcl::Doc doc, RefCntr<DocSequence> source, QWidget* parent = 0)
: QWidget(parent), m_doc(doc), m_source(source)
{
// NOTE(review): this class derives from QWidget, yet the pointer is cast
// to QDialog* for setupUi() -- confirm against the generated ui_snippets.h.
setupUi((QDialog*)this);
// Build and display the page right away
init();
}
protected slots:
// Called for every link clicked in the web view
virtual void linkWasClicked(const QUrl &);
signals:
// Request to open the document in its native viewer, at the given page
void startNativeViewer(Rcl::Doc, int pagenum);
private:
// Compute the snippets and load the resulting HTML into the view
void init();
// Document the snippets are about
Rcl::Doc m_doc;
// Sequence used to obtain the snippets and the terms to highlight
RefCntr<DocSequence> m_source;
};
#endif /* _SNIPPETS_W_H_INCLUDED_ */

View file

@ -95,6 +95,13 @@ class DocSequence {
abs.push_back(doc.meta[Rcl::Doc::keyabs]); abs.push_back(doc.meta[Rcl::Doc::keyabs]);
return true; return true;
} }
/** Default (page, snippet) abstract: a single entry holding the stored
 *  document abstract, with page number 0 as no page is known here. */
virtual bool getAbstract(Rcl::Doc& doc,
                         std::vector<std::pair<int, std::string> >& abs)
{
    const std::pair<int, std::string> entry(0, doc.meta[Rcl::Doc::keyabs]);
    abs.push_back(entry);
    return true;
}
virtual int getFirstMatchPage(Rcl::Doc&) virtual int getFirstMatchPage(Rcl::Doc&)
{ {
return -1; return -1;
@ -106,8 +113,16 @@ class DocSequence {
virtual int getResCnt() = 0; virtual int getResCnt() = 0;
/** Get title for result list */ /** Get title for result list */
virtual std::string title() {return m_title;} virtual std::string title()
{
return m_title;
}
/** Can do snippets ? The default answer is no: a sequence which can
 *  supply real per-page snippets has to override this to return true. */
virtual bool snippetsCapable()
{
return false;
}
/** Get description for underlying query */ /** Get description for underlying query */
virtual std::string getDescription() = 0; virtual std::string getDescription() = 0;
@ -157,6 +172,20 @@ public:
return false; return false;
return m_seq->getAbstract(doc, abs); return m_seq->getAbstract(doc, abs);
} }
/** Forward the (page, snippet) abstract request to the underlying
 *  sequence. Fails if there is none. */
virtual bool getAbstract(Rcl::Doc& doc,
                         std::vector<std::pair<int, std::string> >& abs)
{
    return m_seq.isNull() ? false : m_seq->getAbstract(doc, abs);
}
/** Snippets are available if there is an underlying sequence and it
 *  says that it can supply them. */
virtual bool snippetsCapable()
{
    return !m_seq.isNull() && m_seq->snippetsCapable();
}
virtual std::string getDescription() virtual std::string getDescription()
{ {
if (m_seq.isNull()) if (m_seq.isNull())

View file

@ -65,6 +65,32 @@ int DocSequenceDb::getResCnt()
return m_rescnt; return m_rescnt;
} }
// Compute the (page, snippet) list used to fill up the snippets window.
// Most abstract/snippets preferences are ignored here: we use a fixed
// occurrences limit and a context slightly wider than the configured one.
// Always returns true: if nothing could be computed, the stored document
// abstract is returned as single entry.
bool DocSequenceDb::getAbstract(Rcl::Doc &doc,
                                vector<pair<int, string> >& vpabs)
{
    LOGDEB(("DocSequenceDb::getAbstract/pair\n"));
    setQuery();

    // Have to put the limit somewhere.
    const int occslimit = 500;

    Rcl::abstract_result status = Rcl::ABSRES_ERROR;
    if (m_q->whatDb()) {
        const int ctxwords = m_q->whatDb()->getAbsCtxLen() + 2;
        status = m_q->whatDb()->makeDocAbstract(doc, m_q.getptr(), vpabs,
                                                occslimit, ctxwords);
    }

    // Nothing synthesized: fall back to the stored abstract, no page number
    if (vpabs.empty()) {
        vpabs.push_back(pair<int, string>(0, doc.meta[Rcl::Doc::keyabs]));
    }

    // If the list was probably truncated, indicate it.
    if (status == Rcl::ABSRES_TRUNC) {
        vpabs.push_back(pair<int, string>(-1, "[...]"));
    }
    return true;
}
bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<string>& vabs) bool DocSequenceDb::getAbstract(Rcl::Doc &doc, vector<string>& vabs)
{ {
setQuery(); setQuery();

View file

@ -31,6 +31,11 @@ class DocSequenceDb : public DocSequence {
virtual bool getDoc(int num, Rcl::Doc &doc, string * = 0); virtual bool getDoc(int num, Rcl::Doc &doc, string * = 0);
virtual int getResCnt(); virtual int getResCnt();
virtual void getTerms(HighlightData& hld); virtual void getTerms(HighlightData& hld);
// Called to fill-up the snippets window. Ignores
// buildabstract/replaceabstract and syntabslen
virtual bool getAbstract(Rcl::Doc &doc, vector<pair<int, string> >&);
virtual bool getAbstract(Rcl::Doc &doc, vector<string>&); virtual bool getAbstract(Rcl::Doc &doc, vector<string>&);
virtual int getFirstMatchPage(Rcl::Doc&); virtual int getFirstMatchPage(Rcl::Doc&);
virtual bool getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc); virtual bool getEnclosing(Rcl::Doc& doc, Rcl::Doc& pdoc);
@ -45,6 +50,11 @@ class DocSequenceDb : public DocSequence {
m_queryBuildAbstract = qba; m_queryBuildAbstract = qba;
m_queryReplaceAbstract = qra; m_queryReplaceAbstract = qra;
} }
// This sequence can supply (page, snippet) lists: see the getAbstract()
// variant taking a vector of pairs.
virtual bool snippetsCapable()
{
return true;
}
virtual string title(); virtual string title();
private: private:

View file

@ -230,7 +230,9 @@ static void listList(const string&, const vector<string>&)
} }
#endif #endif
// Retrieve and store db-wide frequencies for the query terms. // Retrieve db-wide frequencies for the query terms and store them in
// the query object. This is done at most once for a query, and the data is used
// while computing abstracts for the different result documents.
void Db::Native::setDbWideQTermsFreqs(Query *query) void Db::Native::setDbWideQTermsFreqs(Query *query)
{ {
// Do it once only for a given query. // Do it once only for a given query.
@ -252,7 +254,7 @@ void Db::Native::setDbWideQTermsFreqs(Query *query)
for (vector<string>::const_iterator qit = qterms.begin(); for (vector<string>::const_iterator qit = qterms.begin();
qit != qterms.end(); qit++) { qit != qterms.end(); qit++) {
query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt; query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(), LOGABS(("set..QTermFreqs: [%s] db freq %.1e\n", qit->c_str(),
query->m_nq->termfreqs[*qit])); query->m_nq->termfreqs[*qit]));
} }
} }
@ -306,6 +308,7 @@ double Db::Native::qualityTerms(Xapian::docid docid,
} }
#ifdef DEBUGABSTRACT #ifdef DEBUGABSTRACT
LOGDEB(("Db::qualityTerms:\n"));
for (multimap<double, string>::reverse_iterator qit = byQ.rbegin(); for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
qit != byQ.rend(); qit++) { qit != byQ.rend(); qit++) {
LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str())); LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str()));
@ -317,6 +320,7 @@ double Db::Native::qualityTerms(Xapian::docid docid,
// Return the positions list for the page break term // Return the positions list for the page break term
bool Db::Native::getPagePositions(Xapian::docid docid, vector<int>& vpos) bool Db::Native::getPagePositions(Xapian::docid docid, vector<int>& vpos)
{ {
vpos.clear();
// Need to retrieve the document record to check for multiple page breaks // Need to retrieve the document record to check for multiple page breaks
// that we store there for lack of better place // that we store there for lack of better place
map<int, int> mbreaksmap; map<int, int> mbreaksmap;
@ -422,25 +426,26 @@ int Db::Native::getFirstMatchPage(Xapian::docid docid, Query *query)
// //
// DatabaseModified and other general exceptions are catched and // DatabaseModified and other general exceptions are catched and
// possibly retried by our caller // possibly retried by our caller
vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query) abstract_result Db::Native::makeAbstract(Xapian::docid docid, Query *query,
vector<pair<int, string> >& vabs,
int imaxoccs, int ictxwords)
{ {
Chrono chron; Chrono chron;
LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(), LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d imaxoccs %d\n", chron.ms(),
m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen)); m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen, imaxoccs));
// The (unprefixed) terms matched by this document // The (unprefixed) terms matched by this document
vector<string> terms; vector<string> matchedTerms;
{ {
vector<string> iterms; vector<string> iterms;
query->getMatchTerms(docid, iterms); query->getMatchTerms(docid, iterms);
noPrefixList(iterms, terms); noPrefixList(iterms, matchedTerms);
if (terms.empty()) { if (matchedTerms.empty()) {
LOGDEB(("makeAbstract::Empty term list\n")); LOGDEB(("makeAbstract::Empty term list\n"));
return vector<string>(); return ABSRES_ERROR;
} }
} }
listList("Match terms: ", terms); listList("Match terms: ", matchedTerms);
// Retrieve the term freqencies for the query terms. This is // Retrieve the term freqencies for the query terms. This is
// actually computed only once for a query, and for all terms in // actually computed only once for a query, and for all terms in
@ -455,12 +460,12 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
// removing its meaning from the maximum occurrences per term test // removing its meaning from the maximum occurrences per term test
// used while walking the list below) // used while walking the list below)
multimap<double, string> byQ; multimap<double, string> byQ;
double totalweight = qualityTerms(docid, query, terms, byQ); double totalweight = qualityTerms(docid, query, matchedTerms, byQ);
LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms())); LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
// This can't happen, but would crash us // This can't happen, but would crash us
if (totalweight == 0.0) { if (totalweight == 0.0) {
LOGERR(("makeAbstract: totalweight == 0.0 !\n")); LOGERR(("makeAbstract: totalweight == 0.0 !\n"));
return vector<string>(); return ABSRES_ERROR;
} }
/////////////////// ///////////////////
@ -473,21 +478,25 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
// terms, at their positions around the search terms positions: // terms, at their positions around the search terms positions:
map<unsigned int, string> sparseDoc; map<unsigned int, string> sparseDoc;
// All the chosen query term positions. // Total number of occurences for all terms. We stop when we have too much
vector<unsigned int> qtermposs; unsigned int totaloccs = 0;
// Limit the total number of slots we populate. The 7 is taken as // Limit the total number of slots we populate. The 7 is taken as
// average word size. It was a mistake to have the user max // average word size. It was a mistake to have the user max
// abstract size parameter in characters, we basically only deal // abstract size parameter in characters, we basically only deal
// with words. We used to limit the character size at the end, but // with words. We used to limit the character size at the end, but
// this damaged our careful selection of terms // this damaged our careful selection of terms
const unsigned int maxtotaloccs = const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
m_rcldb->m_synthAbsLen /(7 * (m_rcldb->m_synthAbsWordCtxLen+1)); m_rcldb->m_synthAbsLen /(7 * (m_rcldb->m_synthAbsWordCtxLen+1));
LOGABS(("makeAbstract:%d: mxttloccs %d\n", chron.ms(), maxtotaloccs)); int ctxwords = ictxwords == -1 ? m_rcldb->m_synthAbsWordCtxLen : ictxwords;
LOGABS(("makeAbstract:%d: mxttloccs %d ctxwords %d\n",
chron.ms(), maxtotaloccs, ctxwords));
// This is used to mark positions overlapped by a multi-word match term // This is used to mark positions overlapped by a multi-word match term
const string occupiedmarker("?"); const string occupiedmarker("?");
abstract_result ret = ABSRES_OK;
// Let's go populate // Let's go populate
for (multimap<double, string>::reverse_iterator qit = byQ.rbegin(); for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
qit != byQ.rend(); qit++) { qit != byQ.rend(); qit++) {
@ -508,7 +517,10 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
Xapian::PositionIterator pos; Xapian::PositionIterator pos;
// There may be query terms not in this doc. This raises an // There may be query terms not in this doc. This raises an
// exception when requesting the position list, we catch it. // exception when requesting the position list, we catch it ??
// Not clear how this can happen because we are walking the
// match list returned by Xapian. Maybe something with the
// fields?
string emptys; string emptys;
try { try {
unsigned int occurrences = 0; unsigned int occurrences = 0;
@ -519,14 +531,14 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
continue; continue;
LOGABS(("makeAbstract: [%s] at %d occurrences %d maxoccs %d\n", LOGABS(("makeAbstract: [%s] at %d occurrences %d maxoccs %d\n",
qterm.c_str(), ipos, occurrences, maxoccs)); qterm.c_str(), ipos, occurrences, maxoccs));
// Remember the term position
qtermposs.push_back(ipos); totaloccs++;
// Add adjacent slots to the set to populate at next // Add adjacent slots to the set to populate at next
// step by inserting empty strings. Special provisions // step by inserting empty strings. Special provisions
// for adding ellipsis and for positions overlapped by // for adding ellipsis and for positions overlapped by
// the match term. // the match term.
unsigned int sta = MAX(0, ipos-m_rcldb->m_synthAbsWordCtxLen); unsigned int sta = MAX(0, ipos - ctxwords);
unsigned int sto = ipos + qtrmwrdcnt-1 + unsigned int sto = ipos + qtrmwrdcnt-1 +
m_rcldb->m_synthAbsWordCtxLen; m_rcldb->m_synthAbsWordCtxLen;
for (unsigned int ii = sta; ii <= sto; ii++) { for (unsigned int ii = sta; ii <= sto; ii++) {
@ -552,23 +564,29 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
// Limit to allocated occurences and total size // Limit to allocated occurences and total size
if (++occurrences >= maxoccs || if (++occurrences >= maxoccs ||
qtermposs.size() >= maxtotaloccs) totaloccs >= maxtotaloccs) {
ret = ABSRES_TRUNC;
LOGDEB(("Db::makeAbstract: max occurrences cutoff\n"));
break; break;
}
} }
} catch (...) { } catch (...) {
// Term does not occur. No problem. // Term does not occur. No problem.
} }
if (qtermposs.size() >= maxtotaloccs) if (totaloccs >= maxtotaloccs) {
ret = ABSRES_TRUNC;
LOGDEB(("Db::makeAbstract: max1 occurrences cutoff\n"));
break; break;
}
} }
LOGABS(("makeAbstract:%d:chosen number of positions %d\n", LOGABS(("makeAbstract:%d:chosen number of positions %d\n",
chron.millis(), qtermposs.size())); chron.millis(), totaloccs));
// This can happen if there are term occurences in the keywords // This can happen if there are term occurences in the keywords
// etc. but not elsewhere ? // etc. but not elsewhere ?
if (qtermposs.size() == 0) { if (totaloccs == 0) {
LOGDEB1(("makeAbstract: no occurrences\n")); LOGDEB1(("makeAbstract: no occurrences\n"));
return vector<string>(); return ABSRES_ERROR;
} }
// Walk all document's terms position lists and populate slots // Walk all document's terms position lists and populate slots
@ -586,6 +604,7 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
if (has_prefix(*term)) if (has_prefix(*term))
continue; continue;
if (cutoff-- < 0) { if (cutoff-- < 0) {
ret = ABSRES_TRUNC;
LOGDEB0(("makeAbstract: max term count cutoff\n")); LOGDEB0(("makeAbstract: max term count cutoff\n"));
break; break;
} }
@ -594,6 +613,7 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
for (pos = xrdb.positionlist_begin(docid, *term); for (pos = xrdb.positionlist_begin(docid, *term);
pos != xrdb.positionlist_end(docid, *term); pos++) { pos != xrdb.positionlist_end(docid, *term); pos++) {
if (cutoff-- < 0) { if (cutoff-- < 0) {
ret = ABSRES_TRUNC;
LOGDEB0(("makeAbstract: max term count cutoff\n")); LOGDEB0(("makeAbstract: max term count cutoff\n"));
break; break;
} }
@ -604,8 +624,8 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
// at the same position, we want to keep only the // at the same position, we want to keep only the
// first one (ie: dockes and dockes@wanadoo.fr) // first one (ie: dockes and dockes@wanadoo.fr)
if (vit->second.empty()) { if (vit->second.empty()) {
LOGABS(("makeAbstract: populating: [%s] at %d\n", LOGDEB2(("makeAbstract: populating: [%s] at %d\n",
(*term).c_str(), *pos)); (*term).c_str(), *pos));
sparseDoc[*pos] = *term; sparseDoc[*pos] = *term;
} }
} }
@ -637,19 +657,19 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
LOGABS(("makeAbstract:%d: extracting. Got %u pages\n", chron.millis(), LOGABS(("makeAbstract:%d: extracting. Got %u pages\n", chron.millis(),
vpbreaks.size())); vpbreaks.size()));
// Finally build the abstract by walking the map (in order of position) // Finally build the abstract by walking the map (in order of position)
vector<string> vabs; vabs.clear();
string chunk; string chunk;
bool incjk = false; bool incjk = false;
int page = 0;
for (map<unsigned int, string>::const_iterator it = sparseDoc.begin(); for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
it != sparseDoc.end(); it++) { it != sparseDoc.end(); it++) {
LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str())); LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
if (!occupiedmarker.compare(it->second)) if (!occupiedmarker.compare(it->second))
continue; continue;
if (chunk.empty() && !vpbreaks.empty()) { if (chunk.empty() && !vpbreaks.empty()) {
int pnum = getPageNumberForPosition(vpbreaks, it->first); page = getPageNumberForPosition(vpbreaks, it->first);
ostringstream ss; if (page < 0)
ss << pnum; page = 0;
chunk += string(" [p ") + ss.str() + "] ";
} }
Utf8Iter uit(it->second); Utf8Iter uit(it->second);
bool newcjk = false; bool newcjk = false;
@ -659,7 +679,7 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
chunk += " "; chunk += " ";
incjk = newcjk; incjk = newcjk;
if (it->second == cstr_ellipsis) { if (it->second == cstr_ellipsis) {
vabs.push_back(chunk); vabs.push_back(pair<int,string>(page, chunk));
chunk.clear(); chunk.clear();
} else { } else {
if (it->second.compare(end_of_field_term) && if (it->second.compare(end_of_field_term) &&
@ -668,10 +688,10 @@ vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
} }
} }
if (!chunk.empty()) if (!chunk.empty())
vabs.push_back(chunk); vabs.push_back(pair<int, string>(page, chunk));
LOGDEB2(("makeAbtract: done in %d mS\n", chron.millis())); LOGDEB2(("makeAbtract: done in %d mS\n", chron.millis()));
return vabs; return ret;
} }
/* Rcl::Db methods ///////////////////////////////// */ /* Rcl::Db methods ///////////////////////////////// */
@ -1516,6 +1536,8 @@ bool Db::addOrUpdate(const string &udi, const string &parent_udi,
if (!tpidx.m_pageincrvec.empty()) { if (!tpidx.m_pageincrvec.empty()) {
ostringstream multibreaks; ostringstream multibreaks;
for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) { for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
if (i != 0)
multibreaks << ",";
multibreaks << tpidx.m_pageincrvec[i].first << "," << multibreaks << tpidx.m_pageincrvec[i].first << "," <<
tpidx.m_pageincrvec[i].second; tpidx.m_pageincrvec[i].second;
} }
@ -2168,31 +2190,59 @@ bool Db::stemDiffers(const string& lang, const string& word,
return true; return true;
} }
// Build the abstract for a document as a list of (page, snippet) pairs.
// maxoccs and ctxwords are passed through to the native makeAbstract().
// Returns ABSRES_ERROR if the db is not open or if the computation raised
// an error (m_reason set), else the native method's result.
abstract_result Db::makeDocAbstract(Doc &doc, Query *query,
                                    vector<pair<int, string> >& abstract,
                                    int maxoccs, int ctxwords)
{
    LOGDEB(("makeDocAbstract: maxoccs %d ctxwords %d\n", maxoccs, ctxwords));
    if (!m_ndb || !m_ndb->m_isopen) {
        LOGERR(("Db::makeDocAbstract: no db\n"));
        return ABSRES_ERROR;
    }

    abstract_result status = ABSRES_ERROR;
    XAPTRY(status = m_ndb->makeAbstract(doc.xdocid, query, abstract,
                                        maxoccs, ctxwords),
           m_ndb->xrdb, m_reason);

    // A non-empty reason means that the call above failed
    return m_reason.empty() ? status : ABSRES_ERROR;
}
bool Db::makeDocAbstract(Doc &doc, Query *query, vector<string>& abstract) bool Db::makeDocAbstract(Doc &doc, Query *query, vector<string>& abstract)
{ {
LOGDEB1(("Db::makeDocAbstract: exti %d\n", exti));
if (!m_ndb || !m_ndb->m_isopen) { if (!m_ndb || !m_ndb->m_isopen) {
LOGERR(("Db::makeDocAbstract: no db\n")); LOGERR(("Db::makeDocAbstract: no db\n"));
return false; return false;
} }
XAPTRY(abstract = m_ndb->makeAbstract(doc.xdocid, query), vector<pair<int, string> > vpabs;
m_ndb->xrdb, m_reason); if (!makeDocAbstract(doc, query, vpabs))
return m_reason.empty() ? true : false; return false;
for (vector<pair<int, string> >::const_iterator it = vpabs.begin();
it != vpabs.end(); it++) {
string chunk;
if (it->first > 0) {
ostringstream ss;
ss << it->first;
chunk += string(" [p ") + ss.str() + "] ";
}
chunk += it->second;
abstract.push_back(chunk);
}
return true;
} }
bool Db::makeDocAbstract(Doc &doc, Query *query, string& abstract) bool Db::makeDocAbstract(Doc &doc, Query *query, string& abstract)
{ {
LOGDEB1(("Db::makeDocAbstract: exti %d\n", exti));
if (!m_ndb || !m_ndb->m_isopen) { if (!m_ndb || !m_ndb->m_isopen) {
LOGERR(("Db::makeDocAbstract: no db\n")); LOGERR(("Db::makeDocAbstract: no db\n"));
return false; return false;
} }
vector<string> vab; vector<pair<int, string> > vpabs;
XAPTRY(vab = m_ndb->makeAbstract(doc.xdocid, query), if (!makeDocAbstract(doc, query, vpabs))
m_ndb->xrdb, m_reason); return false;
for (vector<string>::const_iterator it = vab.begin(); for (vector<pair<int, string> >::const_iterator it = vpabs.begin();
it != vab.end(); it++) { it != vpabs.end(); it++) {
abstract.append(*it); abstract.append(it->second);
abstract.append(cstr_ellipsis); abstract.append(cstr_ellipsis);
} }
return m_reason.empty() ? true : false; return m_reason.empty() ? true : false;

View file

@ -68,6 +68,11 @@ enum value_slot {
VALUE_SIG = 10 // Doc sig as chosen by app (ex: mtime+size VALUE_SIG = 10 // Doc sig as chosen by app (ex: mtime+size
}; };
// Status returned by the abstract building methods.
enum abstract_result {
// Nothing could be built (no open db, no match terms, no occurrences...).
// Kept at 0 so that the value tests as false in a boolean context.
ABSRES_ERROR = 0,
// Abstract built without hitting a limit
ABSRES_OK = 1,
// Abstract built, but an occurrences or term count cutoff was reached
ABSRES_TRUNC = 2
};
class SearchData; class SearchData;
class TermIter; class TermIter;
class Query; class Query;
@ -291,11 +296,21 @@ class Db {
/** Set parameters for synthetic abstract generation */ /** Set parameters for synthetic abstract generation */
void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen); void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen);
/** Return the current synthetic abstract context length
 *  (m_synthAbsWordCtxLen), a count of words around a matched term.
 *  Presumably the syntCtxLen value given to setAbstractParams() -- to be
 *  confirmed against its implementation. */
int getAbsCtxLen() const
{
return m_synthAbsWordCtxLen;
}
/** Build synthetic abstract for document, extracting chunks relevant for /** Build synthetic abstract for document, extracting chunks relevant for
* the input query. This uses index data only (no access to the file) */ * the input query. This uses index data only (no access to the file) */
// Abstract return as one string
bool makeDocAbstract(Doc &doc, Query *query, string& abstract); bool makeDocAbstract(Doc &doc, Query *query, string& abstract);
// Returned as a snippets vector
bool makeDocAbstract(Doc &doc, Query *query, vector<string>& abstract); bool makeDocAbstract(Doc &doc, Query *query, vector<string>& abstract);
// Returned as a vector of pair<page,snippet> page is 0 if unknown
abstract_result makeDocAbstract(Doc &doc, Query *query,
vector<pair<int, string> >& abstract,
int maxoccs= -1, int ctxwords = -1);
/** Retrieve detected page breaks positions */ /** Retrieve detected page breaks positions */
int getFirstMatchPage(Doc &doc, Query *query); int getFirstMatchPage(Doc &doc, Query *query);

View file

@ -94,7 +94,9 @@ class Db::Native {
const vector<string>& terms, const vector<string>& terms,
std::multimap<double, string>& byQ); std::multimap<double, string>& byQ);
void setDbWideQTermsFreqs(Query *query); void setDbWideQTermsFreqs(Query *query);
vector<string> makeAbstract(Xapian::docid id, Query *query); abstract_result makeAbstract(Xapian::docid id, Query *query,
vector<pair<int, string> >&, int maxoccs = -1,
int ctxwords = -1);
bool getPagePositions(Xapian::docid docid, vector<int>& vpos); bool getPagePositions(Xapian::docid docid, vector<int>& vpos);
int getFirstMatchPage(Xapian::docid docid, Query *query); int getFirstMatchPage(Xapian::docid docid, Query *query);
int getPageNumberForPosition(const vector<int>& pbreaks, unsigned int pos); int getPageNumberForPosition(const vector<int>& pbreaks, unsigned int pos);

View file

@ -81,6 +81,8 @@ indexstemminglanguages = english
# unac_except_trans = Ää Öö Üü ää öö üü ßss # unac_except_trans = Ää Öö Üü ää öö üü ßss
# In French, you probably want to decompose oe and ae # In French, you probably want to decompose oe and ae
# unac_except_trans = œoe Œoe æae Æae # unac_except_trans = œoe Œoe æae Æae
# Actually, this seems a reasonable default for all until someone protests.
unac_except_trans = åå Åå ää Ää öö Öö üü Üü ßss œoe Œoe æae ÆAE fifi flfl
# Where to store the database (directory). This may be an absolute path, # Where to store the database (directory). This may be an absolute path,
# else it is taken as relative to the configuration directory (-c argument # else it is taken as relative to the configuration directory (-c argument

View file

@ -31,9 +31,9 @@
#include <map> #include <map>
#include <string> #include <string>
#include <algorithm> #include <algorithm>
#include <tr1/unordered_map>
using std::string; using std::string;
using std::vector; using std::tr1::unordered_map;
using std::map;
#include "smallut.h" #include "smallut.h"
/* /*
@ -41,20 +41,16 @@ using std::map;
should not be translated according to what UnicodeData says, but should not be translated according to what UnicodeData says, but
instead according to some local rule. There will usually be very instead according to some local rule. There will usually be very
few of them, but they must be looked up for every translated char. few of them, but they must be looked up for every translated char.
We use a sorted vector for fastest elimination by binary search and
a vector<string> to store the translations
*/ */
static vector<unsigned short> except_chars; unordered_map<unsigned short, string> except_trans;
static vector<string> except_trans; static inline bool is_except_char(unsigned short c, string& trans)
static inline size_t is_except_char(unsigned short c)
{ {
vector<unsigned short>::iterator it = unordered_map<unsigned short, string>::const_iterator it
std::lower_bound(except_chars.begin(), except_chars.end(), c); = except_trans.find(c);
if (it == except_chars.end() || *it != c) { if (it == except_trans.end())
return (size_t(-1)); return false;
} trans = it->second;
return std::distance(except_chars.begin(), it); return true;
} }
#endif /* RECOLL_DATADIR */ #endif /* RECOLL_DATADIR */
@ -12715,21 +12711,18 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
// - unaccenting: do nothing (copy original char) // - unaccenting: do nothing (copy original char)
// - unac+fold: use table // - unac+fold: use table
// - fold: use the unicode data. // - fold: use the unicode data.
size_t idx; string trans;
if (what != UNAC_FOLD && except_chars.size() != 0 && if (what != UNAC_FOLD && except_trans.size() != 0 &&
(idx=is_except_char(c)) != (size_t)-1) { is_except_char(c, trans)) {
if (what == UNAC_UNAC) { if (what == UNAC_UNAC) {
// Unaccent only. Do nothing // Unaccent only. Do nothing
p = 0; p = 0;
l = 0; l = 0;
} else { } else {
// Has to be UNAC_UNACFOLD: use table // Has to be UNAC_UNACFOLD: use table
p = (unsigned short *)(except_trans[idx].c_str() + 2); p = (unsigned short *)trans.c_str();
l = (except_trans[idx].size() - 2) / 2; l = trans.size() / 2;
} }
/* if (p) {unsigned char *cp = (unsigned char *)p;
fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0],
(unsigned int)cp[1]);}*/
} else { } else {
#endif /* RECOLL_DATADIR */ #endif /* RECOLL_DATADIR */
unac_uf_char_utf16_(c, p, l, what) unac_uf_char_utf16_(c, p, l, what)
@ -13076,7 +13069,6 @@ const char* unac_version(void)
#ifdef RECOLL_DATADIR #ifdef RECOLL_DATADIR
void unac_set_except_translations(const char *spectrans) void unac_set_except_translations(const char *spectrans)
{ {
except_chars.clear();
except_trans.clear(); except_trans.clear();
if (!spectrans || !spectrans[0]) if (!spectrans || !spectrans[0])
return; return;
@ -13123,14 +13115,8 @@ void unac_set_except_translations(const char *spectrans)
else else
ch = (out[0] << 8) | (out[1] & 0xff); ch = (out[0] << 8) | (out[1] & 0xff);
/* fprintf(stderr, "outsize %d Ch is 0x%hx\n", int(outsize), ch);*/ except_trans[ch] = string((const char *)(out + 2), outsize-2);
except_chars.push_back(ch);
// We keep ch as the first 2 bytes in the translation so that
// both vectors sort identically
except_trans.push_back(string((const char *)out, outsize));
free(out); free(out);
} }
std::sort(except_chars.begin(), except_chars.end());
std::sort(except_trans.begin(), except_trans.end());
} }
#endif /* RECOLL_DATADIR */ #endif /* RECOLL_DATADIR */

View file

@ -5,20 +5,23 @@
#include <string> #include <string>
#include <set> #include <set>
/** Store about user terms and their expansions. This is used mostly for /** Store data about user search terms and their expansions. This is used
* highlighting result text and walking the matches. * mostly for highlighting result text and walking the matches, generating
* spelling suggestions.
*/ */
struct HighlightData { struct HighlightData {
/** The user terms, excluding those with wildcards. /** The user terms, excluding those with wildcards. This list is
* This list is intended for orthographic suggestions but the terms are * intended for orthographic suggestions so the terms are always
* unaccented lowercased anyway because they are compared to the dictionary * lowercased, unaccented or not depending on the type of index
* generated from the index term list (which is unaccented). * (as the spelling dictionary is generated from the index terms).
*/ */
std::set<std::string> uterms; std::set<std::string> uterms;
/** The original user terms-or-groups. This is for displaying the matched /** The original user terms-or-groups. This is for display
* terms or groups, ie in relation with highlighting or skipping to the * purposes: ie when creating a menu to look for a specific
* next match. These are raw, diacritics and case preserved. * matched group inside a preview window. We want to show the
* user-entered data in the menu, not some transformation, so
* these are always raw, diacritics and case preserved.
*/ */
std::vector<std::vector<std::string> > ugroups; std::vector<std::vector<std::string> > ugroups;
@ -35,7 +38,7 @@ struct HighlightData {
/** Index into ugroups for each group. Parallel to groups. As a /** Index into ugroups for each group. Parallel to groups. As a
* user term or group may generate many processed/expanded terms * user term or group may generate many processed/expanded terms
* or groups, this is how we relate them * or groups, this is how we relate an expansion to its source.
*/ */
std::vector<unsigned int> grpsugidx; std::vector<unsigned int> grpsugidx;

View file

@ -31,9 +31,9 @@
#include <map> #include <map>
#include <string> #include <string>
#include <algorithm> #include <algorithm>
#include <tr1/unordered_map>
using std::string; using std::string;
using std::vector; using std::tr1::unordered_map;
using std::map;
#include "smallut.h" #include "smallut.h"
/* /*
@ -41,20 +41,16 @@ using std::map;
should not be translated according to what UnicodeData says, but should not be translated according to what UnicodeData says, but
instead according to some local rule. There will usually be very instead according to some local rule. There will usually be very
few of them, but they must be looked up for every translated char. few of them, but they must be looked up for every translated char.
We use a sorted vector for fastest elimination by binary search and
a vector<string> to store the translations
*/ */
static vector<unsigned short> except_chars; unordered_map<unsigned short, string> except_trans;
static vector<string> except_trans; static inline bool is_except_char(unsigned short c, string& trans)
static inline size_t is_except_char(unsigned short c)
{ {
vector<unsigned short>::iterator it = unordered_map<unsigned short, string>::const_iterator it
std::lower_bound(except_chars.begin(), except_chars.end(), c); = except_trans.find(c);
if (it == except_chars.end() || *it != c) { if (it == except_trans.end())
return (size_t(-1)); return false;
} trans = it->second;
return std::distance(except_chars.begin(), it); return true;
} }
#endif /* RECOLL_DATADIR */ #endif /* RECOLL_DATADIR */
@ -12715,21 +12711,18 @@ int unacmaybefold_string_utf16(const char* in, size_t in_length,
// - unaccenting: do nothing (copy original char) // - unaccenting: do nothing (copy original char)
// - unac+fold: use table // - unac+fold: use table
// - fold: use the unicode data. // - fold: use the unicode data.
size_t idx; string trans;
if (what != UNAC_FOLD && except_chars.size() != 0 && if (what != UNAC_FOLD && except_trans.size() != 0 &&
(idx=is_except_char(c)) != (size_t)-1) { is_except_char(c, trans)) {
if (what == UNAC_UNAC) { if (what == UNAC_UNAC) {
// Unaccent only. Do nothing // Unaccent only. Do nothing
p = 0; p = 0;
l = 0; l = 0;
} else { } else {
// Has to be UNAC_UNACFOLD: use table // Has to be UNAC_UNACFOLD: use table
p = (unsigned short *)(except_trans[idx].c_str() + 2); p = (unsigned short *)trans.c_str();
l = (except_trans[idx].size() - 2) / 2; l = trans.size() / 2;
} }
/* if (p) {unsigned char *cp = (unsigned char *)p;
fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0],
(unsigned int)cp[1]);}*/
} else { } else {
#endif /* RECOLL_DATADIR */ #endif /* RECOLL_DATADIR */
unac_uf_char_utf16_(c, p, l, what) unac_uf_char_utf16_(c, p, l, what)
@ -13076,7 +13069,6 @@ const char* unac_version(void)
#ifdef RECOLL_DATADIR #ifdef RECOLL_DATADIR
void unac_set_except_translations(const char *spectrans) void unac_set_except_translations(const char *spectrans)
{ {
except_chars.clear();
except_trans.clear(); except_trans.clear();
if (!spectrans || !spectrans[0]) if (!spectrans || !spectrans[0])
return; return;
@ -13123,14 +13115,8 @@ void unac_set_except_translations(const char *spectrans)
else else
ch = (out[0] << 8) | (out[1] & 0xff); ch = (out[0] << 8) | (out[1] & 0xff);
/* fprintf(stderr, "outsize %d Ch is 0x%hx\n", int(outsize), ch);*/ except_trans[ch] = string((const char *)(out + 2), outsize-2);
except_chars.push_back(ch);
// We keep ch as the first 2 bytes in the translation so that
// both vectors sort identically
except_trans.push_back(string((const char *)out, outsize));
free(out); free(out);
} }
std::sort(except_chars.begin(), except_chars.end());
std::sort(except_trans.begin(), except_trans.end());
} }
#endif /* RECOLL_DATADIR */ #endif /* RECOLL_DATADIR */

View file

@ -86,6 +86,13 @@
<h2>News</h2> <h2>News</h2>
<div class="news"> <div class="news">
<ul> <ul>
<li>2012-09-21: an
<a href="https://bitbucket.org/medoc/recoll/wiki/ElinksBeagle">easy
way</a> to extend the "Beagle queue"
Recoll web history indexing mechanism to browsers other than
Firefox (Elinks in this case).
</li>
<li>2012-09-13: the next Recoll version will maybe acquire switchable <li>2012-09-13: the next Recoll version will maybe acquire switchable
case and diacritics sensitivity. I am writing case and diacritics sensitivity. I am writing
a few pages about the a few pages about the