files_reader: working towards working search, W.I.P.

2025-10-02 14:49:17 +02:00 · 2017-03-29 15:42:27 +02:00 · 2017-03-29 15:42:27 +02:00 · 4b31f04330
commit 4b31f04330
parent 2dd54dd028
14 changed files with 580 additions and 173 deletions
--- a/files_reader/vendor/pdfjs/controllers/search_controller.js
+++ b/files_reader/vendor/pdfjs/controllers/search_controller.js
@ -1,11 +1,36 @@
+// Find states passed to updateUIState() as a find command progresses.
+var FindStates = {
+    // A match was selected without wrapping (also reported for an empty query).
+    FIND_FOUND: 0,
+    // No match could be selected.
+    FIND_NOTFOUND: 1,
+    // A match was selected after the search offset wrapped around.
+    FIND_WRAPPED: 2,
+    // A find command has been accepted and is being processed.
+    FIND_PENDING: 3
+};
+
+// Scroll offsets for bringing a match into view. In the code visible here
+// they are only referenced from the commented-out scrollIntoView() call in
+// updateMatchPosition.
+var FIND_SCROLL_OFFSET_TOP = -50;
+var FIND_SCROLL_OFFSET_LEFT = -400;
+
+// Maps typographic characters to the plain text they are replaced with by
+// normalize(), which is applied to both the page text and the query.
+var CHARACTERS_TO_NORMALIZE = {
+    '\u2018': '\'', // Left single quotation mark
+    '\u2019': '\'', // Right single quotation mark
+    '\u201A': '\'', // Single low-9 quotation mark
+    '\u201B': '\'', // Single high-reversed-9 quotation mark
+    '\u201C': '"', // Left double quotation mark
+    '\u201D': '"', // Right double quotation mark
+    '\u201E': '"', // Double low-9 quotation mark
+    '\u201F': '"', // Double high-reversed-9 quotation mark
+    '\u00BC': '1/4', // Vulgar fraction one quarter
+    '\u00BD': '1/2', // Vulgar fraction one half
+    '\u00BE': '3/4', // Vulgar fraction three quarters
+};
+
 PDFJS.reader.SearchController = function () {
+
    var reader = this,
        book = this.book,
        query = "";

    var $searchBox = $("#searchBox"),
        $clearBtn = $("#searchBox").next(),
-        $clear_search = $("#clear_search"), 
+        $clear_search = $("#clear_search"),
        $searchResults = $("#searchResults"),
        $searchView = $("#searchView"),
        $body = $("#viewer iframe").contents().find('body'),
@ -21,6 +46,456 @@ PDFJS.reader.SearchController = function () {
        $searchView.removeClass("open");
    };

+    // Optional UI callback slots, declared on the constructor's `this`
+    // (the same object as `reader`) and left unset until a consumer
+    // assigns them.
+    this.onUpdateResultsCount = null;
+    this.onUpdateState = null;
+
+    // Compile the regular expression for text normalization once.
+    // The keys of CHARACTERS_TO_NORMALIZE are joined into one character class.
+    var replace = Object.keys(CHARACTERS_TO_NORMALIZE).join('');
+    this.normalizationRegex = new RegExp('[' + replace + ']', 'g');
+
+    // Puts every piece of find state back to its initial value.
+    // NOTE(review): reset() is invoked below without a receiver, so `this`
+    // inside it is not the constructor's `this` (`reader`) -- confirm which
+    // object is meant to hold the find state.
+    var reset = function () {
+        this.startedTextExtraction = false;
+        this.extractTextPromises = [];
+        this.pendingFindMatches = Object.create(null);
+        this.active = false; // If active, find results will be highlighted.
+        this.pageContents = []; // Stores the text for each page.
+        this.pageMatches = [];
+        this.pageMatchesLength = null;
+        this.matchCount = 0;
+        this.selected = { // Currently selected match.
+            pageIdx: -1,
+            matchIdx: -1
+        };
+        this.offset = { // Where the find algorithm currently is in the document.
+            pageIdx: null,
+            matchIdx: null
+        };
+        this.pagesToSearch = null;
+        this.resumePageIdx = null;
+        this.state = null;
+        this.dirtyMatch = false;
+        this.findTimeout = null;
+    };
+
+    // Initialise the find state once when the controller is created.
+    reset();
+
+
+    /**
+     * Replaces every character listed in CHARACTERS_TO_NORMALIZE with its
+     * plain-text equivalent.
+     * @param {string} text - text to normalize.
+     * @returns {string} the normalized text.
+     */
+    var normalize = function (text) {
+        // normalize() is called as a plain function, so `this` is not the
+        // object the compiled regex was stored on; read it from `reader`,
+        // which is the constructor's `this`.
+        return text.replace(reader.normalizationRegex, function (ch) {
+            return CHARACTERS_TO_NORMALIZE[ch];
+        });
+    };
+
+    // Helper for multi-term search: sorts the raw hits and copies them into
+    // the parallel `matches` / `matchesLength` arrays, dropping every hit that
+    // is covered by another term's hit (for example "tamed tame" or
+    // "this is"). Where hits intersect like that, the longer one is kept.
+
+    var _prepareMatches = function (
+        matchesWithLength, matches, matchesLength) {
+
+        // Marks entries[at] as skipped and returns true when another hit
+        // covers it; returns false when it should be kept.
+        function isSubTerm(entries, at) {
+            var candidate = entries[at];
+            var following = entries[at + 1];
+            var hasFollowing = at < entries.length - 1;
+            // Same start as the next hit in sort order: "TAMEd TAME".
+            if (hasFollowing && candidate.match === following.match) {
+                candidate.skipped = true;
+                return true;
+            }
+            // Fully inside an earlier hit that was kept: "thIS IS".
+            var candidateEnd = candidate.match + candidate.matchLength;
+            var back = at - 1;
+            while (back >= 0) {
+                var earlier = entries[back];
+                back -= 1;
+                if (earlier.skipped) {
+                    continue;
+                }
+                var earlierEnd = earlier.match + earlier.matchLength;
+                if (earlierEnd < candidate.match) {
+                    break;
+                }
+                if (earlierEnd >= candidateEnd) {
+                    candidate.skipped = true;
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        // Order the { match, matchLength } records by start index first,
+        // then by length (the input array is sorted in place).
+        matchesWithLength.sort(function (left, right) {
+            if (left.match !== right.match) {
+                return left.match - right.match;
+            }
+            return left.matchLength - right.matchLength;
+        });
+        matchesWithLength.forEach(function (entry, position) {
+            if (isSubTerm(matchesWithLength, position)) {
+                return;
+            }
+            matches.push(entry.match);
+            matchesLength.push(entry.matchLength);
+        });
+    };
+
+    /**
+     * Finds every non-overlapping occurrence of the whole query in a page's
+     * text and stores the start indices in this.pageMatches[pageIndex].
+     * @param {string} query - phrase to look for.
+     * @param {number} pageIndex - index of the page being searched.
+     * @param {string} pageContent - text of that page.
+     */
+    var calcFindPhraseMatch = function (
+        query, pageIndex, pageContent) {
+        var found = [];
+        var step = query.length;
+        // Resume each lookup just past the previous hit.
+        for (var pos = pageContent.indexOf(query, 0);
+             pos !== -1;
+             pos = pageContent.indexOf(query, pos + step)) {
+            found.push(pos);
+        }
+        this.pageMatches[pageIndex] = found;
+    };
+
+    /**
+     * Searches a page's text for each whitespace-separated term of the query
+     * and stores the resulting start indices and lengths in
+     * this.pageMatches[pageIndex] and this.pageMatchesLength[pageIndex].
+     * @param {string} query - one or more terms separated by whitespace.
+     * @param {number} pageIndex - index of the page being searched.
+     * @param {string} pageContent - text of that page.
+     */
+    var calcFindWordMatch = function (
+        query, pageIndex, pageContent) {
+        var matchesWithLength = [];
+        // Divide the query into pieces and search for text on each piece.
+        // String.prototype.match returns null when nothing matches (a
+        // whitespace-only query), so fall back to an empty term list.
+        var queryArray = query.match(/\S+/g) || [];
+        var subquery, subqueryLen, matchIdx;
+        for (var i = 0, len = queryArray.length; i < len; i++) {
+            subquery = queryArray[i];
+            subqueryLen = subquery.length;
+            matchIdx = -subqueryLen;
+            while (true) {
+                matchIdx = pageContent.indexOf(subquery, matchIdx + subqueryLen);
+                if (matchIdx === -1) {
+                    break;
+                }
+                // Terms differ in length, so record it next to the index.
+                matchesWithLength.push({
+                    match: matchIdx,
+                    matchLength: subqueryLen,
+                    skipped: false
+                });
+            }
+        }
+        // Prepare the arrays that store the matches.
+        if (!this.pageMatchesLength) {
+            this.pageMatchesLength = [];
+        }
+        this.pageMatchesLength[pageIndex] = [];
+        this.pageMatches[pageIndex] = [];
+        // Sort matchesWithLength, clean up intersecting terms
+        // and put the result into the two arrays.
+        _prepareMatches(matchesWithLength, this.pageMatches[pageIndex],
+            this.pageMatchesLength[pageIndex]);
+    };
+
+    /**
+     * Computes the matches of the current query (this.state) on one page
+     * whose text has already been extracted, then refreshes that page,
+     * resumes a search that was waiting for this page, and adds the page's
+     * matches to the running total.
+     * @param {number} pageIndex - index of the page to search.
+     */
+    var calcFindMatch = function (pageIndex) {
+        var pageContent = normalize(this.pageContents[pageIndex]);
+        var query = normalize(this.state.query);
+        var caseSensitive = this.state.caseSensitive;
+        var phraseSearch = this.state.phraseSearch;
+        var queryLen = query.length;
+
+        if (queryLen === 0) {
+            // Do nothing: the matches should be wiped out already.
+            return;
+        }
+
+        // Case-insensitive search compares lower-cased copies of both sides.
+        if (!caseSensitive) {
+            pageContent = pageContent.toLowerCase();
+            query = query.toLowerCase();
+        }
+
+        // Phrase search looks for the whole query; otherwise each
+        // whitespace-separated term is searched on its own.
+        if (phraseSearch) {
+            calcFindPhraseMatch(query, pageIndex, pageContent);
+        } else {
+            calcFindWordMatch(query, pageIndex, pageContent);
+        }
+
+        updatePage(pageIndex);
+        // nextPageMatch() parks itself on a page whose matches are not
+        // computed yet; continue it now that this page is ready.
+        if (this.resumePageIdx === pageIndex) {
+            this.resumePageIdx = null;
+            nextPageMatch();
+        }
+
+        // Update the matches count
+        if (this.pageMatches[pageIndex].length > 0) {
+            this.matchCount += this.pageMatches[pageIndex].length;
+            updateUIResultsCount();
+        }
+    };
+
+    /**
+     * Starts extracting the text of every page, one page after another.
+     * One promise per page is pushed onto this.extractTextPromises; each is
+     * resolved with its page index once an entry for that page has been
+     * stored in this.pageContents. Calls after the first one do nothing.
+     */
+    var extractText = function () {
+
+        if (this.startedTextExtraction) {
+            return;
+        }
+        this.startedTextExtraction = true;
+
+        this.pageContents = [];
+        var extractTextPromisesResolves = [];
+        var numPages = reader.settings.numPages;
+        for (var i = 0; i < numPages; i++) {
+            this.extractTextPromises.push(new Promise(function (resolve) {
+                extractTextPromisesResolves.push(resolve);
+            }));
+        }
+
+        var self = this;
+
+        // Records the text of one page, releases its promise and moves on
+        // to the next page (or logs the collected text after the last one).
+        function finishPage(pageIndex, text) {
+            // Store the pageContent as a string, keyed by page index.
+            self.pageContents[pageIndex] = text;
+
+            extractTextPromisesResolves[pageIndex](pageIndex);
+            if ((pageIndex + 1) < numPages) {
+                console.log("extracting text from page " + (pageIndex + 1));
+                extractPageText(pageIndex + 1);
+            } else {
+                console.log("finished extracting text");
+                for (var p = 0; p < numPages; p++) {
+                    console.log("PAGE: " + (p + 1));
+                    console.log(self.pageContents[p]);
+                }
+            }
+        }
+
+        function extractPageText(pageIndex) {
+            reader.getPageTextContent(pageIndex).then(
+                function textContentResolved(textContent) {
+                    var textItems = textContent.items;
+                    var str = [];
+
+                    for (var i = 0, len = textItems.length; i < len; i++) {
+                        str.push(textItems[i].str);
+                    }
+
+                    finishPage(pageIndex, str.join(''));
+                },
+                function textContentRejected(reason) {
+                    // A page that cannot be read is treated as empty so the
+                    // remaining pages are still extracted and searches that
+                    // wait on this page's promise do not hang.
+                    console.error("failed to extract text from page " +
+                        pageIndex, reason);
+                    finishPage(pageIndex, '');
+                }
+            );
+        }
+
+        // Nothing to extract when there are no pages.
+        if (numPages > 0) {
+            extractPageText(0);
+        }
+    };
+
+    /**
+     * Entry point for find commands.
+     * @param {string} cmd - 'find' is debounced by 250ms; any other value
+     *   runs immediately. Only 'findagain' with an existing state keeps the
+     *   previously computed matches.
+     * @param {Object} state - find parameters; `query`, `caseSensitive`,
+     *   `phraseSearch` and `findPrevious` are read by the find functions.
+     */
+    var executeCommand = function (cmd, state) {
+        // Anything but a repeat of an existing search invalidates the matches.
+        if (this.state === null || cmd !== 'findagain') {
+            this.dirtyMatch = true;
+        }
+        this.state = state;
+        updateUIState(FindStates.FIND_PENDING);
+
+		console.log("execute command ", cmd, " with state ", state);
+
+        // NOTE(review): the callback and the timer callback are bound to
+        // this function's `this`, while extractText() and the direct
+        // nextMatch() call run without a receiver -- confirm that all of
+        // them are meant to share one state object.
+        reader.firstPagePromise.then(function() {
+            extractText();
+
+            // Restart the debounce window on every command.
+            clearTimeout(this.findTimeout);
+            if (cmd === 'find') {
+                // Only trigger the find action after 250ms of silence.
+                this.findTimeout = setTimeout(nextMatch.bind(this), 250);
+            } else {
+                nextMatch();
+            }
+        }.bind(this));
+    };
+
+    /**
+     * Makes the given page the reader's current page when it is the page
+     * that holds the selected match; does nothing for any other page.
+     * @param {number} index - zero-based page index.
+     */
+    var updatePage = function (index) {
+        var holdsSelection = (index === this.selected.pageIdx);
+        if (!holdsSelection) {
+            return;
+        }
+        // Showing the page triggers rendering, which adds the textLayer;
+        // once the textLayer is built it scrolls onto the selected match.
+        // currentPage is one-based.
+        reader.settings.currentPage = index + 1;
+
+        // TODO: refresh the page's textLayer matches once a page view with
+        // a textLayer is available here.
+    };
+
+    /**
+     * Moves the selection to the next match, or to the previous one when
+     * this.state.findPrevious is set. When the matches are marked dirty,
+     * all per-search state is reset first and match calculation is scheduled
+     * for every page as soon as its text has been extracted.
+     */
+    var nextMatch = function () {
+
+        var previous = this.state.findPrevious;
+        var currentPageIndex = reader.settings.currentPage - 1;
+        var numPages = reader.settings.numPages;
+
+        this.active = true;
+
+        if (this.dirtyMatch) {
+            // Need to recalculate the matches, reset everything.
+            this.dirtyMatch = false;
+            this.selected.pageIdx = this.selected.matchIdx = -1;
+            this.offset.pageIdx = currentPageIndex;
+            this.offset.matchIdx = null;
+            this.hadMatch = false;
+            this.resumePageIdx = null;
+            this.pageMatches = [];
+            this.matchCount = 0;
+            this.pageMatchesLength = null;
+            var self = this;
+
+            for (var i = 0; i < numPages; i++) {
+                // Wipe out any previous highlighted matches.
+                updatePage(i);
+
+                // As soon as the text is extracted start finding the matches.
+                // pendingFindMatches prevents scheduling a page twice.
+                if (!(i in this.pendingFindMatches)) {
+                    this.pendingFindMatches[i] = true;
+                    this.extractTextPromises[i].then(function(pageIdx) {
+                        delete self.pendingFindMatches[pageIdx];
+                        calcFindMatch(pageIdx);
+                    });
+                }
+            }
+        }
+
+        // If there's no query there's no point in searching.
+        if (this.state.query === '') {
+            updateUIState(FindStates.FIND_FOUND);
+            return;
+        }
+
+        // If we're waiting on a page, we return since we can't do anything
+        // else. Compare against null explicitly: page index 0 is a valid
+        // pending page but is falsy.
+        if (this.resumePageIdx != null) {
+            return;
+        }
+
+        var offset = this.offset;
+        // Keep track of how many pages we should maximally iterate through.
+        this.pagesToSearch = numPages;
+        // If there's already a matchIdx that means we are iterating through a
+        // page's matches.
+        if (offset.matchIdx !== null) {
+            var numPageMatches = this.pageMatches[offset.pageIdx].length;
+            if ((!previous && offset.matchIdx + 1 < numPageMatches) ||
+                (previous && offset.matchIdx > 0)) {
+                // The simple case; we just have to advance the matchIdx to
+                // select the next match on the page.
+                this.hadMatch = true;
+                offset.matchIdx = (previous ? offset.matchIdx - 1 :
+                    offset.matchIdx + 1);
+                updateMatch(true);
+                return;
+            }
+            // We went beyond the current page's matches, so we advance to
+            // the next page.
+            advanceOffsetPage(previous);
+        }
+        // Start searching through the page.
+        nextPageMatch();
+    };
+
+    /**
+     * Processes the computed matches of the page at this.offset.pageIdx.
+     * @param {Array} matches - match start indices for that page.
+     * @returns {boolean} true when page iteration should stop (a match was
+     *   selected, or every page was visited without finding one); false
+     *   when the caller should continue with the next page.
+     */
+    var matchesReady = function (matches) {
+        var offset = this.offset;
+        var numMatches = matches.length;
+        var previous = this.state.findPrevious;
+
+        if (numMatches) {
+            // There were matches for the page, so initialize the matchIdx.
+            // Searching backwards starts from the last match on the page.
+            this.hadMatch = true;
+            offset.matchIdx = (previous ? numMatches - 1 : 0);
+            updateMatch(true);
+            return true;
+        }
+        // No matches, so attempt to search the next page.
+        advanceOffsetPage(previous);
+        if (offset.wrapped) {
+            offset.matchIdx = null;
+            // pagesToSearch is decremented once per visited page, so a
+            // negative value means every page has been tried.
+            if (this.pagesToSearch < 0) {
+                // No point in wrapping again, there were no matches.
+                updateMatch(false);
+                // while matches were not found, searching for a page
+                // with matches should nevertheless halt.
+                return true;
+            }
+        }
+        // Matches were not found (and searching is not done).
+        return false;
+    };
+
+    /**
+     * Meant to be called back from the text layer when match presentation
+     * is updated. The scrolling code is commented out, so at present the
+     * function only writes a message to the console.
+     * @param {number} pageIndex - page index.
+     * @param {number} index - match index.
+     * @param {Array} elements - text layer div elements array.
+     * @param {number} beginIdx - start index of the div array for the match.
+     */
+        var updateMatchPosition = function (
+            pageIndex, index, elements, beginIdx) {
+                // Only the currently selected match would be scrolled to.
+                if (this.selected.matchIdx === index &&
+                    this.selected.pageIdx === pageIndex) {
+                        //var spot = {
+                            //    top: FIND_SCROLL_OFFSET_TOP,
+                            //    left: FIND_SCROLL_OFFSET_LEFT
+                            //};
+                        //scrollIntoView(elements[beginIdx], spot,
+                            //        /* skipOverflowHiddenElements = */ true);
+                    }
+                console.log("would scroll into view here except for the fact that Reader is a non-scrolling reader...");
+            };
+
+    /**
+     * Walks through the pages starting at this.offset.pageIdx until
+     * matchesReady() reports that iteration should stop, or until a page is
+     * reached whose matches have not been computed yet. In the latter case
+     * the page index is stored in this.resumePageIdx so that calcFindMatch()
+     * can continue the walk once that page is ready.
+     */
+    var nextPageMatch = function () {
+        // A pending page at this point is only reported, not treated as fatal.
+        if (this.resumePageIdx !== null) {
+            console.error('There can only be one pending page.');
+        }
+        do {
+            var pageIdx = this.offset.pageIdx;
+            var matches = this.pageMatches[pageIdx];
+            if (!matches) {
+                // The matches don't exist yet for processing by "matchesReady",
+                // so set a resume point for when they do exist.
+                this.resumePageIdx = pageIdx;
+                break;
+            }
+        } while (!matchesReady(matches));
+    };
+
+    /**
+     * Moves the search offset one page forwards or backwards, wrapping
+     * around at either end of the document, and counts the step against
+     * this.pagesToSearch.
+     * @param {boolean} previous - true to step backwards.
+     */
+    var advanceOffsetPage = function (previous) {
+        var cursor = this.offset;
+        var pageCount = this.extractTextPromises.length;
+
+        cursor.pageIdx += (previous ? -1 : 1);
+        cursor.matchIdx = null;
+        this.pagesToSearch -= 1;
+
+        var ranOffEnd = cursor.pageIdx < 0 || cursor.pageIdx >= pageCount;
+        if (ranOffEnd) {
+            // Re-enter the document from the opposite end and remember that
+            // a wrap happened.
+            cursor.pageIdx = (previous ? pageCount - 1 : 0);
+            cursor.wrapped = true;
+        }
+    };
+
+    /**
+     * Publishes the outcome of a search step: copies the search offset into
+     * the selection when a match was found, reports the resulting state via
+     * updateUIState() and refreshes the affected pages.
+     * @param {boolean} found - whether a match was found at this.offset.
+     */
+    var updateMatch = function (found) {
+        var state = FindStates.FIND_NOTFOUND;
+        // Read the wrapped flag and clear it for the next search step.
+        var wrapped = this.offset.wrapped;
+        this.offset.wrapped = false;
+
+        if (found) {
+            var previousPage = this.selected.pageIdx;
+            this.selected.pageIdx = this.offset.pageIdx;
+            this.selected.matchIdx = this.offset.matchIdx;
+            state = (wrapped ? FindStates.FIND_WRAPPED : FindStates.FIND_FOUND);
+            // Update the currently selected page to wipe out any selected matches.
+            if (previousPage !== -1 && previousPage !== this.selected.pageIdx) {
+                updatePage(previousPage);
+            }
+        }
+
+        updateUIState(state, this.state.findPrevious);
+        // -1 means nothing is selected, so there is no page to refresh.
+        if (this.selected.pageIdx !== -1) {
+            updatePage(this.selected.pageIdx);
+        }
+    };
+
+    /**
+     * Reports the current total number of matches to the optional
+     * onUpdateResultsCount callback.
+     */
+    var updateUIResultsCount = function () {
+        // The callback slot is declared on `reader` (the constructor's
+        // `this`). A bare `onUpdateResultsCount` identifier would only ever
+        // resolve to a global, so look it up and call it on `reader`.
+        if (reader.onUpdateResultsCount) {
+            reader.onUpdateResultsCount(this.matchCount);
+        }
+    };
+
+    /**
+     * Reports a find state change to the optional onUpdateState callback.
+     * @param {number} state - one of the FindStates values.
+     * @param {boolean} [previous] - whether the search ran backwards.
+     */
+    var updateUIState = function (state, previous) {
+        // The callback slot is declared on `reader` (the constructor's
+        // `this`). A bare `onUpdateState` identifier would only ever
+        // resolve to a global, so look it up and call it on `reader`.
+        if (reader.onUpdateState) {
+            reader.onUpdateState(state, previous, this.matchCount);
+        }
+    };
+
+
    var search = function(q) {
        if (q === undefined) {
            q = $searchBox.val();
@ -38,7 +513,9 @@ PDFJS.reader.SearchController = function () {

        reader.SearchController.query = q;

-        runQuery(q, $searchResults[0]);
+        //runQuery(q, $searchResults[0]);
+
+		executeCommand('find', {query: q});

    };

@ -84,72 +561,11 @@ PDFJS.reader.SearchController = function () {
        book.off("renderer:chapterDisplayed", highlightQuery);
    };

-    // perform search and build result list
-    var runQuery = function(query, element) {
-
-        return new Promise(function(resolve, reject) {
-
-            var results = [];
-
-            for (var i = 0; i < book.spine.length; i++) {
-                var spineItem = book.spine[i];
-                results.push(new Promise(function(resolve, reject) {
-                    new Promise(function(resolve, reject) {
-                        resolve(new PDFJS.Chapter(spineItem, book.store, book.credentials));
-                    }).then(function(chapter) {
-                        return new Promise(function(resolve, reject) {
-                            chapter.load().then(function() {
-                                resolve(chapter);
-                            }).catch(reject);
-                        });
-                    }).then(function(chapter) {
-                        return Promise.resolve(chapter.find(query));
-                    }).then(function(result) {
-                        resolve(result);
-                    });
-                }));
-            }
-            Promise.all(results).then(function(results) {
-                return new Promise(function(resolve, reject) {
-                    resolve(results);
-                    var mergedResults = [].concat.apply([], results);
-                    element.innerHTML = "";
-                    for (var i = 0; i < mergedResults.length; i++) {
-                        try {
-                            var listitem = document.createElement("li");
-                            var link = document.createElement("a");
-                            listitem.classList.add("list_item");
-                            listitem.id = "search-"+i;
-                            link.href=mergedResults[i].cfi;
-                            link.textContent = mergedResults[i].excerpt;
-                            link.classList.add("toc_link");
-                            link.addEventListener("click", function(e) {
-                                e.preventDefault();
-                                book.gotoCfi(this.getAttribute("href"));
-                                $searchResults.find(".list_item")
-                                    .removeClass("currentChapter");
-                                $(this).parent("li").addClass("currentChapter");
-                                $(this).data('query', query);
-                                book.on("renderer:chapterDisplayed", highlightQuery);
-                            });
-                            listitem.appendChild(link);
-                            element.appendChild(listitem);
-                        } catch (e) {
-                            console.warn(e);
-                        }
-                    }
-                });
-            });
-        });
-    };
-

    return {
-        "show"  : onShow,
-        "hide"  : onHide,
+        "show": onShow,
+        "hide": onHide,
        "search": search,
-        "query" : query,
-        "clear" : clear,
-        "unhighlight"   : unhighlight
+        "executeCommand": executeCommand
    };
 };