diff --git a/src/filters/rclmpdf.py b/src/filters/rclmpdf.py
index 66beec4a..91514a49 100755
--- a/src/filters/rclmpdf.py
+++ b/src/filters/rclmpdf.py
@@ -233,11 +233,8 @@ class PDFExtractor:
         inbody = False
         didcs = False
         output = b''
-        cont = b''
         isempty = True
         for line in input.split(b'\n'):
-            line = cont + line
-            cont = b''
             if re.search(b'</head>', line):
                 inheader = False
             if re.search(b'</pre>', line):
@@ -264,17 +261,11 @@ class PDFExtractor:
                 s = line[0:1]
                 if s != "\x0c" and s != "<":
                     isempty = False
-
-                # Remove end-of-line hyphenation. It's not clear that
-                # we should do this as pdftotext without the -layout
-                # option does it ?
-                #if re.search(r'[-]$', line):
-                    #m = re.search(r'(.*)[ \t]([^ \t]+)$', line)
-                    #if m:
-                        #line = m.group(1)
-                        #cont = m.group(2).rstrip('-')
+                # We used to remove end-of-line hyphenation (and join
+                # lines), but it's not clear that we should do
+                # this as pdftotext without the -layout option does it ?
                 line = self.em.htmlescape(line)
-
+
             if re.search(b'<head>', line):
                 inheader = True
             if re.search(b'<pre>', line):
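
The hunk above drops the cont buffer that only served the (already commented-out) end-of-line de-hyphenation code, on the assumption, stated in the new comment, that pdftotext already joins hyphenated words when run without -layout. As a rough illustration of the kind of cleanup loop being patched (a minimal sketch with illustrative names, not the actual recoll filter code), the body-extraction logic amounts to:

    import re

    def extract_pre_text(html_bytes):
        # Keep only the text between <pre> and </pre> in pdftotext
        # -htmlmeta style output, escape it for HTML, and report
        # whether any real text was seen.
        inbody = False
        output = b''
        isempty = True
        for line in html_bytes.split(b'\n'):
            # The closing tag is tested before the line is used, the
            # opening tag after, so the tag lines themselves are never kept.
            if re.search(b'</pre>', line):
                inbody = False
            if inbody:
                if line[0:1] not in (b'\x0c', b'<'):
                    isempty = False
                # No de-hyphenation or line joining here: pdftotext is
                # assumed to have done that already.
                line = line.replace(b'&', b'&amp;').replace(b'<', b'&lt;')
                output += line + b'\n'
            if re.search(b'<pre>', line):
                inbody = True
        return output, isempty

Feeding it b'<pre>\nhyphen-\nated\n</pre>' returns the two body lines unchanged, which is the behavior the patch settles on.
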
diff --git a/src/sampleconf/mimemap b/src/sampleconf/mimemap
index d52c11c2..9c8648f8 100644
--- a/src/sampleconf/mimemap
+++ b/src/sampleconf/mimemap
@@ -150,6 +150,9 @@
 
 .mkv = video/x-matroska
 .ogv = video/ogg
+.flv = video/x-flv
+.mp4 = video/mp4
+.ts = video/MP2T
 
 .png = image/png
 .jp2 = image/jp2
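
The mimemap additions simply bind three more video extensions to MIME types so the indexer can recognize the files. The format is plain "extension = type" lines; a throwaway reader for it (a hypothetical helper, not recoll's own configuration parser, and assuming '#' starts a comment) could look like:

    def parse_mimemap(path):
        # Read ".ext = mime/type" associations, skipping blank lines
        # and '#' comments, and return them as a dict.
        mapping = {}
        with open(path) as f:
            for raw in f:
                line = raw.strip()
                if not line or line.startswith('#'):
                    continue
                ext, sep, mtype = line.partition('=')
                if sep:
                    mapping[ext.strip()] = mtype.strip()
        return mapping

    # e.g. parse_mimemap('src/sampleconf/mimemap')['.ts'] == 'video/MP2T'
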
diff --git a/website/perfs.html b/website/perfs.html
index ad7312c0..b821cc96 100644
--- a/website/perfs.html
+++ b/website/perfs.html
@@ -43,9 +43,10 @@
 	store an index. Obviously, your data set will never fit one of
 	the samples, so the results cannot be exactly predicted.
 
-	<p>The following data was obtained on a machine with a 1800 Mhz
+	<p>The following very old data was obtained on a machine with a
+	1800 Mhz
 	AMD Duron CPU, 768Mb of Ram, and a 7200 RPM 160 GBytes IDE
-	disk, running Suse 10.1.
+	disk, running Suse 10.1. More recent data follows.
 
 	<p>recollindex (version 1.8.2 with xapian 1.0.0) is executed
 	with the default flush threshold value.
@@ -106,8 +107,74 @@
 	performance degradation. The resulting index is bigger though,
 	the exact reason is not known to me, possibly because of
 	additional fragmentation
 
+
+	<p>There is more recent performance data (2012) at the end of
+	the article about
+	converting Recoll indexing to multithreading.
+
+	<p>Update, March 2016: I took another sample of PDF performance
+	data on a more modern machine, with Recoll multithreading turned
+	on. The machine has an Intel Core i7-4770T CPU, which has 4
+	physical cores, and supports hyper-threading for a total of 8
+	threads, 8 GBytes of RAM, and SSD storage (incidentally the PC
+	is fanless; this is not a "beast" computer).
+
+	<table border=1>
+	  <tr>
+	    <th>Data</th>
+	    <th>Data size</th>
+	    <th>Indexing time</th>
+	    <th>Index size</th>
+	    <th>Peak process memory usage</th>
+	  </tr>
+	  <tr>
+	    <td>Random pdfs harvested on Google<br>
+	      Recoll 1.21.5, idxflushmb set to 200, thread
+	      parameters 6/4/1</td>
+	    <td>11 GB, 5320 files</td>
+	    <td>3 mn 15 s</td>
+	    <td>400 MB</td>
+	    <td>545 MB</td>
+	  </tr>
+	</table>
+
+	<p>The indexing process used 21 mn of CPU during these 3 mn 15 s
+	of real time, so we are not letting these cores stay idle
+	much... The improvement compared to the numbers above is quite
+	spectacular (a factor of 11, approximately), mostly due to the
+	multiprocessing, but also to the faster CPU and the SSD
+	storage. Note that the peak memory value is for the
+	recollindex process, and does not take into account the
+	multiple Python and pdftotext instances (which are relatively
+	small but things add up...).
+
+	<p><b>Improving indexing performance with hardware:</b>
+
+	<p>I think that the following multi-step approach has a good
+	chance to improve performance:
+
+	<p>At some point, the index writing may become the
+	bottleneck. As far as I can think, the only possible approach
+	then is to partition the index.
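
As a quick sanity check of the figures quoted in the March 2016 paragraph (values taken from the text above; the "factor of 11" is the author's comparison with the older measurements on the page):

    # 21 mn of CPU time spent during 3 mn 15 s of wall-clock time on a
    # 4-core / 8-thread CPU.
    cpu_minutes = 21.0
    wall_minutes = 3.0 + 15.0 / 60.0           # 3 mn 15 s = 3.25 mn
    hw_threads = 8

    parallelism = cpu_minutes / wall_minutes   # ~6.5 busy threads on average
    utilization = parallelism / hw_threads     # ~0.81 of the 8 threads

    print(round(parallelism, 1), round(utilization, 2))

which indeed supports the claim that the cores are not left idle much.
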