/* Copyright (C) 2005 J.F.Dockes * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the * Free Software Foundation, Inc., * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include #include #include #include using std::vector; using std::list; using std::pair; using std::set; #include "rcldb.h" #include "rclconfig.h" #include "debuglog.h" #include "textsplit.h" #include "utf8iter.h" #include "smallut.h" #include "plaintorich.h" #include "cancelcheck.h" #include "unacpp.h" // For debug printing static string vecStringToString(const vector& t) { string sterms; for (vector::const_iterator it = t.begin(); it != t.end(); it++) { sterms += "[" + *it + "] "; } return sterms; } struct MatchEntry { // Start/End byte offsets in the document text pair offs; // Index of the search group this comes from: this is to relate a // match to the original user input. size_t grpidx; MatchEntry(int sta, int sto, size_t idx) : offs(sta, sto), grpidx(idx) { } }; // Text splitter used to take note of the position of query terms // inside the result text. This is then used to insert highlight tags. class TextSplitPTR : public TextSplit { public: // Out: begin and end byte positions of query terms/groups in text vector tboffs; TextSplitPTR(const HighlightData& hdata) : m_wcount(0), m_hdata(hdata) { // We separate single terms and groups and extract the group // terms for computing positions list before looking for group // matches for (vector >::const_iterator vit = hdata.groups.begin(); vit != hdata.groups.end(); vit++) { if (vit->size() == 1) { m_terms[vit->front()] = vit - hdata.groups.begin(); } else if (vit->size() > 1) { for (vector::const_iterator it = vit->begin(); it != vit->end(); it++) { m_gterms.insert(*it); } } } } // Accept word and its position. If word is search term, add // highlight zone definition. If word is part of search group // (phrase or near), update positions list. virtual bool takeword(const std::string& term, int pos, int bts, int bte) { string dumb = term; if (o_index_stripchars) { if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) { LOGINFO(("PlainToRich::takeword: unac failed for [%s]\n", term.c_str())); return true; } } //LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(), // pos, bts, bte)); // If this word is a search term, remember its byte-offset span. map::const_iterator it = m_terms.find(dumb); if (it != m_terms.end()) { tboffs.push_back(MatchEntry(bts, bte, (*it).second)); } // If word is part of a search group, update its positions list if (m_gterms.find(dumb) != m_gterms.end()) { // Term group (phrase/near) handling m_plists[dumb].push_back(pos); m_gpostobytes[pos] = pair(bts, bte); //LOGDEB2(("Recorded bpos for %d: %d %d\n", pos, bts, bte)); } // Check for cancellation request if ((m_wcount++ & 0xfff) == 0) CancelCheck::instance().checkCancel(); return true; } // Must be called after the split to find the phrase/near match positions virtual bool matchGroups(); private: virtual bool matchGroup(unsigned int idx); // Word count. Used to call checkCancel from time to time. int m_wcount; // In: user query terms map m_terms; // m_gterms holds all the terms in m_groups, as a set for quick lookup set m_gterms; const HighlightData& m_hdata; // group/near terms word positions. map > m_plists; map > m_gpostobytes; }; /** Sort by shorter comparison class */ class VecIntCmpShorter { public: /** Return true if and only if a is strictly shorter than b. */ bool operator()(const vector *a, const vector *b) { return a->size() < b->size(); } }; #define SETMINMAX(POS, STA, STO) {if ((POS) < (STA)) (STA) = (POS); \ if ((POS) > (STO)) (STO) = (POS);} // Check that at least an entry from the first position list is inside // the window and recurse on next list. The window is readjusted as // the successive terms are found. // // @param window the search window width // @param plists the position list vector // @param i the position list to process (we then recurse with the next list) // @param min the current minimum pos for a found term // @param max the current maximum pos for a found term // @param sp, ep output: the found area // @param minpos bottom of search: this is the highest point of // any previous match. We don't look below this as overlapping matches // make no sense for highlighting. static bool do_proximity_test(int window, vector* >& plists, unsigned int i, int min, int max, int *sp, int *ep, int minpos) { LOGDEB1(("do_prox_test: win %d i %d min %d max %d minpos %d\n", window, i, min, max, minpos)); int tmp = max + 1 - window; if (tmp < minpos) tmp = minpos; // Find 1st position bigger than window start vector::iterator it = plists[i]->begin(); while (it != plists[i]->end() && *it < tmp) it++; // Look for position inside window. If not found, no match. If // found: if this is the last list we're done, else recurse on // next list after adjusting the window while (it != plists[i]->end()) { int pos = *it; if (pos > min + window - 1) return false; if (i + 1 == plists.size()) { SETMINMAX(pos, *sp, *ep); return true; } SETMINMAX(pos, min, max); if (do_proximity_test(window,plists, i + 1, min, max, sp, ep, minpos)) { SETMINMAX(pos, *sp, *ep); return true; } it++; } return false; } // Find NEAR matches for one group of terms, update highlight map bool TextSplitPTR::matchGroup(unsigned int grpidx) { const vector& terms = m_hdata.groups[grpidx]; int window = int(m_hdata.groups[grpidx].size() + m_hdata.slacks[grpidx]); LOGDEB1(("TextSplitPTR::matchGroup:d %d: %s\n", window, vecStringToString(terms).c_str())); // The position lists we are going to work with. We extract them from the // (string->plist) map vector* > plists; // A revert plist->term map. This is so that we can find who is who after // sorting the plists by length. map*, string> plistToTerm; // Find the position list for each term in the group. It is // possible that this particular group was not actually matched by // the search, so that some terms are not found. for (vector::const_iterator it = terms.begin(); it != terms.end(); it++) { map >::iterator pl = m_plists.find(*it); if (pl == m_plists.end()) { LOGDEB1(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n", (*it).c_str())); return false; } plists.push_back(&(pl->second)); plistToTerm[&(pl->second)] = *it; } // I think this can't actually happen, was useful when we used to // prune the groups, but doesn't hurt. if (plists.size() < 2) { LOGDEB1(("TextSplitPTR::matchGroup: no actual groups found\n")); return false; } // Sort the positions lists so that the shorter is first std::sort(plists.begin(), plists.end(), VecIntCmpShorter()); { // Debug map*, string>::iterator it; it = plistToTerm.find(plists[0]); if (it == plistToTerm.end()) { // SuperWeird LOGERR(("matchGroup: term for first list not found !?!\n")); return false; } LOGDEB1(("matchGroup: walking the shortest plist. Term [%s], len %d\n", it->second.c_str(), plists[0]->size())); } // Minpos is the highest end of a found match. While looking for // further matches, we don't want the search to extend before // this, because it does not make sense for highlight regions to // overlap int minpos = 0; // Walk the shortest plist and look for matches for (vector::iterator it = plists[0]->begin(); it != plists[0]->end(); it++) { int pos = *it; int sta = INT_MAX, sto = 0; LOGDEB2(("MatchGroup: Testing at pos %d\n", pos)); if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) { LOGDEB1(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n", sta, sto)); // Maybe extend the window by 1st term position, this was not // done by do_prox.. SETMINMAX(pos, sta, sto); minpos = sto+1; // Translate the position window into a byte offset window map >::iterator i1 = m_gpostobytes.find(sta); map >::iterator i2 = m_gpostobytes.find(sto); if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) { LOGDEB2(("TextSplitPTR::matchGroup: pushing bpos %d %d\n", i1->second.first, i2->second.second)); tboffs.push_back(MatchEntry(i1->second.first, i2->second.second, grpidx)); } else { LOGDEB0(("matchGroup: no bpos found for %d or %d\n", sta, sto)); } } else { LOGDEB1(("matchGroup: no group match found at this position\n")); } } return true; } /** Sort integer pairs by increasing first value and decreasing width */ class PairIntCmpFirst { public: bool operator()(const MatchEntry& a, const MatchEntry& b) { if (a.offs.first != b.offs.first) return a.offs.first < b.offs.first; return a.offs.second > b.offs.second; } }; // Look for matches to PHRASE and NEAR term groups and finalize the // matched regions list (sort it by increasing start then decreasing // length) // Actually, we handle all groups as NEAR (ignore order). bool TextSplitPTR::matchGroups() { for (unsigned int i = 0; i < m_hdata.groups.size(); i++) { if (m_hdata.groups[i].size() <= 1) continue; matchGroup(i); } // Sort regions by increasing start and decreasing width. // The output process will skip overlapping entries. std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst()); return true; } // Fix result text for display inside the gui text window. // // We call overridden functions to output header data, beginnings and ends of // matches etc. // // If the input is text, we output the result in chunks, arranging not // to cut in the middle of a tag, which would confuse qtextedit. If // the input is html, the body is always a single output chunk. bool PlainToRich::plaintorich(const string& in, list& out, // Output chunk list const HighlightData& hdata, int chunksize) { Chrono chron; bool ret = true; LOGDEB1(("plaintorichich: in: [%s]\n", in.c_str())); m_hdata = &hdata; // Compute the positions for the query terms. We use the text // splitter to break the text into words, and compare the words to // the search terms, TextSplitPTR splitter(hdata); // Note: the splitter returns the term locations in byte, not // character, offsets. splitter.text_to_words(in); LOGDEB2(("plaintorich: split done %d mS\n", chron.millis())); // Compute the positions for NEAR and PHRASE groups. splitter.matchGroups(); LOGDEB2(("plaintorich: group match done %d mS\n", chron.millis())); out.clear(); out.push_back(""); list::iterator olit = out.begin(); // Rich text output *olit = header(); // No term matches. Happens, for example on a snippet selected for // a term match when we are actually looking for a group match // (the snippet generator does this...). if (splitter.tboffs.empty()) { LOGDEB1(("plaintorich: no term matches\n")); ret = false; } // Iterator for the list of input term positions. We use it to // output highlight tags and to compute term positions in the // output text vector::iterator tPosIt = splitter.tboffs.begin(); vector::iterator tPosEnd = splitter.tboffs.end(); #if 0 for (vector >::const_iterator it = splitter.tboffs.begin(); it != splitter.tboffs.end(); it++) { LOGDEB2(("plaintorich: region: %d %d\n", it->first, it->second)); } #endif // Input character iterator Utf8Iter chariter(in); // State variables used to limit the number of consecutive empty lines, // convert all eol to '\n', and preserve some indentation int eol = 0; int hadcr = 0; int inindent = 1; // HTML state bool intag = false, inparamvalue = false; // My tag state int inrcltag = 0; string::size_type headend = 0; if (m_inputhtml) { headend = in.find(""); if (headend == string::npos) headend = in.find(""); if (headend != string::npos) headend += 7; } for (string::size_type pos = 0; pos != string::npos; pos = chariter++) { // Check from time to time if we need to stop if ((pos & 0xfff) == 0) { CancelCheck::instance().checkCancel(); } // If we still have terms positions, check (byte) position. If // we are at or after a term match, mark. if (tPosIt != tPosEnd) { size_t ibyteidx = chariter.getBpos(); if (ibyteidx == tPosIt->offs.first) { if (!intag && ibyteidx >= (int)headend) { *olit += startMatch((unsigned int)(tPosIt->grpidx)); } inrcltag = 1; } else if (ibyteidx == tPosIt->offs.second) { // Output end of match region tags if (!intag && ibyteidx > (int)headend) { *olit += endMatch(); } // Skip all highlight areas that would overlap this one int crend = tPosIt->offs.second; while (tPosIt != splitter.tboffs.end() && tPosIt->offs.first < crend) tPosIt++; inrcltag = 0; } } unsigned int car = *chariter; if (car == '\n') { if (!hadcr) eol++; hadcr = 0; continue; } else if (car == '\r') { hadcr++; eol++; continue; } else if (eol) { // Got non eol char in line break state. Do line break; inindent = 1; hadcr = 0; if (eol > 2) eol = 2; while (eol) { if (!m_inputhtml && m_eolbr) *olit += "
"; *olit += "\n"; eol--; } // Maybe end this chunk, begin next. Don't do it on html // there is just no way to do it right (qtextedit cant grok // chunks cut in the middle of for example). if (!m_inputhtml && !inrcltag && olit->size() > (unsigned int)chunksize) { out.push_back(string(startChunk())); olit++; } } switch (car) { case '<': inindent = 0; if (m_inputhtml) { if (!inparamvalue) intag = true; chariter.appendchartostring(*olit); } else { *olit += "<"; } break; case '>': inindent = 0; if (m_inputhtml) { if (!inparamvalue) intag = false; } chariter.appendchartostring(*olit); break; case '&': inindent = 0; if (m_inputhtml) { chariter.appendchartostring(*olit); } else { *olit += "&"; } break; case '"': inindent = 0; if (m_inputhtml && intag) { inparamvalue = !inparamvalue; } chariter.appendchartostring(*olit); break; case ' ': if (m_eolbr && inindent) { *olit += " "; } else { chariter.appendchartostring(*olit); } break; case '\t': if (m_eolbr && inindent) { *olit += "    "; } else { chariter.appendchartostring(*olit); } break; default: inindent = 0; chariter.appendchartostring(*olit); } } // End chariter loop #if 0 { FILE *fp = fopen("/tmp/debugplaintorich", "a"); fprintf(fp, "BEGINOFPLAINTORICHOUTPUT\n"); for (list::iterator it = out.begin(); it != out.end(); it++) { fprintf(fp, "BEGINOFPLAINTORICHCHUNK\n"); fprintf(fp, "%s", it->c_str()); fprintf(fp, "ENDOFPLAINTORICHCHUNK\n"); } fprintf(fp, "ENDOFPLAINTORICHOUTPUT\n"); fclose(fp); } #endif LOGDEB2(("plaintorich: done %d mS\n", chron.millis())); return ret; }