diff --git a/src/filters/rclmpdf.py b/src/filters/rclmpdf.py
index 66beec4a..91514a49 100755
--- a/src/filters/rclmpdf.py
+++ b/src/filters/rclmpdf.py
@@ -233,11 +233,8 @@ class PDFExtractor:
         inbody = False
         didcs = False
         output = b''
-        cont = b''
         isempty = True
         for line in input.split(b'\n'):
-            line = cont + line
-            cont = b''
             if re.search(b'</head>', line):
                 inheader = False
             if re.search(b'</pre>', line):
@@ -264,17 +261,11 @@ class PDFExtractor:
                 s = line[0:1]
                 if s != "\x0c" and s != "<":
                     isempty = False
-
-                # Remove end-of-line hyphenation. It's not clear that
-                # we should do this as pdftotext without the -layout
-                # option does it ?
-                #if re.search(r'[-]$', line):
-                    #m = re.search(r'(.*)[ \t]([^ \t]+)$', line)
-                    #if m:
-                        #line = m.group(1)
-                        #cont = m.group(2).rstrip('-')
+                # We used to remove end-of-line hyphenation (and join
+                # lines), but it's not clear that we should do this as
+                # pdftotext without the -layout option does it ?
                 line = self.em.htmlescape(line)
-
+
             if re.search(b'<head>', line):
                 inheader = True
             if re.search(b'<pre>', line):
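For context, the cont variable deleted in the first hunk above existed only to feed the commented-out de-hyphenation block deleted in the second hunk: when a line ended with a hyphen, its last word was carried over and glued to the start of the next line. A rough standalone sketch of that technique, using plain strings instead of the filter's bytes and an illustrative helper name, could look like this:

    # Sketch of the end-of-line de-hyphenation idea (not the filter's
    # actual code; helper name and example input are illustrative).
    def join_hyphenated(lines):
        out = []
        cont = ''
        for line in lines:
            line = cont + line
            cont = ''
            if line.endswith('-'):
                # Keep the line up to its last word, and carry the
                # de-hyphenated last word over to the next line.
                head, sep, last = line[:-1].rpartition(' ')
                if sep:
                    line = head
                    cont = last
            out.append(line)
        if cont:
            out.append(cont)
        return out

    print(join_hyphenated(['some hyphen-', 'ated text', 'last line']))
    # -> ['some', 'hyphenated text', 'last line']

Whether the filter should do this at all is exactly what the new comment questions, since, as it notes, pdftotext run without -layout appears to do the joining itself.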
diff --git a/src/sampleconf/mimemap b/src/sampleconf/mimemap
index d52c11c2..9c8648f8 100644
--- a/src/sampleconf/mimemap
+++ b/src/sampleconf/mimemap
@@ -150,6 +150,9 @@
 .mkv = video/x-matroska
 .ogv = video/ogg
+.flv = video/x-flv
+.mp4 = video/mp4
+.ts = video/MP2T
 .png = image/png
 .jp2 = image/jp2
diff --git a/website/perfs.html b/website/perfs.html
index ad7312c0..b821cc96 100644
--- a/website/perfs.html
+++ b/website/perfs.html
@@ -43,9 +43,10 @@
 store an index. Obviously, your data set will never fit one of
 the samples, so the results cannot be exactly predicted.
 
-The following data was obtained on a machine with a 1800 Mhz
+The following very old data was obtained on a machine with a
+ 1800 Mhz
 AMD Duron CPU, 768Mb of Ram, and a 7200 RPM 160 GBytes IDE
- disk, running Suse 10.1.
+ disk, running Suse 10.1. More recent data follows.
 
 recollindex (version 1.8.2 with xapian 1.0.0) is executed with
 the default flush threshold value.
@@ -106,8 +107,74 @@
 performance degradation. The resulting index is bigger though,
 the exact reason is not known to me, possibly because of
 additional fragmentation
+
+There is more recent performance data (2012) at the end of the
+ article about converting Recoll indexing to multithreading.
+
+Update, March 2016: I took another sample of PDF performance
+ data on a more modern machine, with Recoll multithreading turned
+ on. The machine has an Intel Core i7-4770T CPU, which has 4
+ physical cores, and supports hyper-threading for a total of 8
+ threads, 8 GBytes of RAM, and SSD storage (incidentally, the PC
+ is fanless; this is not a "beast" computer).
+
+ Data | Data size | Indexing time | Index size | Peak process memory usage
+ -----|-----------|---------------|------------|--------------------------
+ Random pdfs harvested on Google; Recoll 1.21.5, idxflushmb set to 200, thread parameters 6/4/1 | 11 GB, 5320 files | 3 mn 15 s | 400 MB | 545 MB
+
+The indexing process used 21 mn of CPU during these 3 mn 15 s of
+ real time, so we are not letting these cores stay idle
+ much... The improvement compared to the numbers above is quite
+ spectacular (a factor of 11, approximately), mostly due to the
+ multiprocessing, but also to the faster CPU and the SSD
+ storage. Note that the peak memory value is for the
+ recollindex process, and does not take into account the
+ multiple Python and pdftotext instances (which are relatively
+ small, but things add up...).
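For anyone wanting to reproduce the setup quoted above ("idxflushmb set to 200, thread parameters 6/4/1"): these values go in the index configuration file, recoll.conf. The fragment below is only a sketch; idxflushmb is named in the text, but the thread-count variable name (thrTCounts) is an assumption to verify against the documentation of your Recoll version.

    # Sketch of a recoll.conf fragment matching the test above.
    # idxflushmb comes from the text; thrTCounts is assumed -- check
    # your version's manual for the exact multithreading variables.
    idxflushmb = 200      # flush index data roughly every 200 MB of text
    thrTCounts = 6 4 1    # worker threads for the three indexing stages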
+
+I think that the following multi-step approach has a good chance
+ to improve performance:
+
+At some point, the index writing may become the
+ bottleneck. As far as I can think, the only possible approach
+ then is to partition the index.
+
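About the closing remark on partitioning the index: Xapian, which Recoll uses as its index engine, can already search several physical databases as if they were one, which is what would make such a split workable on the query side. The sketch below uses the Xapian Python bindings with made-up partition paths; it only illustrates the concept and is not how recollindex works today.

    import xapian

    # Hypothetical per-partition index directories (made-up paths).
    partitions = ['/path/to/xapiandb.part0', '/path/to/xapiandb.part1']

    # Open the first partition and graft the others onto it; Xapian
    # then searches the union transparently.
    db = xapian.Database(partitions[0])
    for path in partitions[1:]:
        db.add_database(xapian.Database(path))

    qp = xapian.QueryParser()
    qp.set_database(db)
    query = qp.parse_query('some search terms')

    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    for match in enquire.get_mset(0, 10):
        print(match.docid, match.percent)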