1
0
Fork 0
mirror of https://github.com/Yetangitu/owncloud-apps.git synced 2025-10-02 14:49:17 +02:00

Improved ISBN regexes, again

This commit is contained in:
frankdelange 2014-12-22 14:29:43 +01:00
parent 84b7c25638
commit e85212bb66
3 changed files with 32 additions and 33 deletions

BIN
dist/files_opds-0.5.2.tar.gz vendored Normal file

Binary file not shown.

View file

@ -4,7 +4,7 @@
<name>OPDS catalog</name>
<description>Personal OPDS catalog</description>
<licence>AGPL</licence>
<version>0.5.1</version>
<version>0.5.2</version>
<author>Frank de Lange</author>
<requiremin>7.0</requiremin>
<shipped>true</shipped>

View file

@ -35,38 +35,8 @@ class Isbn
public static function scan($text) {
$match = array();
foreach($text as $line) {
/* generic ISBN 10/13 pattern */
if(preg_match_all('/ISBN(?:[ -]?[1[03]]?)?:?\s*((97[89])?[X0-9-]{10,14})/i',$line,$match)) {
foreach($match[1] as $hit) {
$hit = preg_replace('/[^0-9X]/i','',$hit);
if(self::validate($hit)) {
return $hit;
}
}
}
/* single ISBN-13 targeted pattern (canonical format) */
if(preg_match_all('/(97[89][ -]?\d[ -]?\d{4}[ -]?\d{4}[ -]?\d)/',$line,$match)) {
foreach($match[1] as $hit) {
$hit = preg_replace('/[^0-9]/','',$hit);
if(self::validate($hit)) {
return $hit;
}
}
}
/* single ISBN-13 targeted pattern (free format) */
if(preg_match_all('/(9[\d -]{11,15}\d)/',$line,$match)) {
foreach($match[1] as $hit) {
$hit = preg_replace('/[^0-9]/','',$hit);
if(self::validate($hit)) {
return $hit;
}
}
}
/* single ISBN-10 targeted pattern */
if(preg_match_all('/(\d[\d -]{8,11}[\dX])/i',$line,$match)) {
/* generic ISBN 10/13 pattern. Checks for unicode dashes ('‒–—―‑‐﹣--') as well as regular hyphens. */
if(preg_match_all('/ISBN(?:[‒–—―‑‐﹣--]?(?:1[03])?)?:?\s*(?=[\d—―-]{10,17})(((?:97[89])[0-9—―-]{9,14})|([\d—―-]{9,12}[\dXx]))/u', $line, $match)) {
foreach($match[1] as $hit) {
$hit = preg_replace('/[^0-9X]/i','',$hit);
if(self::validate($hit)) {
@ -76,6 +46,35 @@ class Isbn
}
}
/* If nothing found, try prefix-less versions. Even though ISBNs should be
* presented with an 'ISBN' prefix, some publications omit this. These patterns
* are liable to generate false positives, so they should only be run after the
* prefixed version has exhausted the search without returning results.
*/
foreach($text as $line) {
/* prefix-less ISBN-13 targeted pattern */
if(preg_match_all('/(97[89][\d—―-]\d{9,13}\d)/u',$line,$match)) {
foreach($match[1] as $hit) {
$hit = preg_replace('/[^0-9]/','',$hit);
if(self::validate($hit)) {
return $hit;
}
}
}
/* prefix-less ISBN-10 targeted pattern */
if(preg_match_all('/(\d[\d—―-]{8,11}[\dXx])/u',$line,$match)) {
foreach($match[1] as $hit) {
$hit = preg_replace('/[^0-9X]/i','',$hit);
if(self::validate($hit)) {
return $hit;
}
}
}
}
/* No ISBN found */
return false;
}