diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index 082ab2ee5..f5a264a84 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -439,7 +439,7 @@ class PhraseParser
/**
* Splits string according to punctuation and white space then
* extracts (stems/char grams) of terms and n word grams from the string
- * Uses a notiona of maximal n word gram to dot eh extraction
+ * Uses a notion of maximal n word gram to do the extraction
*
* @param string $string to extract terms from
* @param string $lang IANA tag to look up stemmer under
@@ -452,34 +452,33 @@ class PhraseParser
$lang = null, $extract_sentences = false)
{
$pos_lists = [];
- $maximal_phrases = [];
$terms = self::stemCharGramSegment($string, $lang);
- if ($terms == []) {
+ if (empty($terms)) {
return [];
}
if (C\SUFFIX_PHRASES == 'true') {
$suffix_tree = new SuffixTree($terms);
- $suffix_tree->outputMaximal(1, "", 0, $maximal_phrases);
+ $suffix_tree->outputMaximal(1, "", 0, $pos_lists);
}
- $t = 0;
- $seen = [];
+ $t = 1; /* first position in doc is 1, as positions will be encoded with modified9
+ which requires positive numbers
+ */
// add all single terms
foreach ($terms as $term) {
- if (!isset($seen[$term])) {
- $seen[$term] = [];
- $maximal_phrases[$term] = [];
+ if (!isset($pos_lists[$term])) {
+ $pos_lists[$term] = [];
}
- $maximal_phrases[$term][] = $t;
+ $pos_lists[$term][] = $t;
$t++;
}
- $out["TERMS_AND_PHRASES"] = $maximal_phrases;
+ $out["TERMS_AND_PHRASES"] = $pos_lists;
$tokenizer = self::getTokenizer($lang);
if ($extract_sentences && method_exists($tokenizer,
"tagTokenizePartOfSpeech") &&
!isset(self::$programming_language_map[$lang])) {
$string = mb_strtolower($string);
$pre_sentences = preg_split("/(\n\n+)|\.|\!|\?|。/u", $string);
- $pos = 0;
+ $pos = 1;
$sentences_pos = [];
$sentences = [];
foreach ($pre_sentences as $pre_sentence) {
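The hunk above switches term positions to 1-based counting so the resulting position lists can later be run through encodeModified9, which cannot represent 0. Below is a minimal standalone sketch of the $pos_lists shape this produces; the sample $terms array is illustrative, not real output of Yioop's stemCharGramSegment:

<?php
/* Illustrative only: build 1-based position lists for terms, mirroring the
 * single-term loop in extractPhrasesInLists above. Positions start at 1
 * because Modified-9 reserves 0 as an end-of-sequence marker.
 */
$terms = ["the", "quick", "brown", "the"]; // pretend tokenizer output
$pos_lists = [];
$t = 1;                                    // first document position is 1
foreach ($terms as $term) {
    if (!isset($pos_lists[$term])) {
        $pos_lists[$term] = [];
    }
    $pos_lists[$term][] = $t;
    $t++;
}
// $pos_lists == ["the" => [1, 4], "quick" => [2], "brown" => [3]]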
diff --git a/src/library/Utility.php b/src/library/Utility.php
index 4eb6a62cd..16a3e6226 100755
--- a/src/library/Utility.php
+++ b/src/library/Utility.php
@@ -388,7 +388,10 @@ function deDeltaList(&$delta_list)
/**
* Mini-class (so not own file) used to hold encode decode info related to
- * Mod9 encoding (as variant of Simplified-9 specify to Yioop)
+ * Mod9 encoding (a variant of Simplified-9 specific to Yioop).
+ * Mod9 is used to encode a sequence of positive (greater than 0) integers
+ * as a string. WARNING: do not expect it to work/decode correctly if the
+ * sequence contains a 0, as the decoding process assumes 0 indicates the end of the sequence.
* @see encodeModified9 for a complete description
*/
class Mod9Constants
@@ -433,7 +436,7 @@ class Mod9Constants
}
/**
* Encodes a sequence of integers x, such that 1 <= x <= 2<<28-1
- * as a string.
+ * as a string. NOTICE: x >= 1.
*
* The encoded string is a sequence of 4 byte words (packed int's).
* The high order 2 bits of a given word indicate whether or not
@@ -545,8 +548,7 @@ function nextPostString(&$input_string, &$offset)
return "";
}
$end += 4;
- while($end < $len &&
- $flag_bits >= $continue_threshold) {
+ while($end < $len && $flag_bits >= $continue_threshold) {
$flag_bits = (ord($input_string[$end]) & $flag_mask);
$end += 4;
}
@@ -574,7 +576,7 @@ function decodeModified9($input_string, &$offset)
if (!extension_loaded("yioop") ) {
/**
- * Decoded a single word with high two bits off according to modified 9
+ * Decodes a single word with high two bits off according to modified 9
*
* @param string $encoded_list four byte string to decode
* @return array sequence of integers that results from the decoding.
@@ -605,17 +607,20 @@ function unpackListModified9($encoded_list)
$int_string = packInt($encoded_list);
$first_char = ord($int_string[0]);
foreach ($MOD9_NUM_BITS_CODES as $code => $num_bits) {
- if (($first_char & $code) == $code) break;
+ if (($first_char & $code) == $code) {
+ break;
+ }
}
$num_elts = $MOD9_NUM_ELTS_DECODES[$code];
$mask = (1 << $num_bits) - 1;
$int_string[0] = chr($first_char - $code);
$encoded_list = unpackInt($int_string);
}
-
$decoded_list = [];
for ($i = 0; $i < $num_elts; $i++) {
- if (($pre_elt = $encoded_list & $mask) == 0) break;
+ if (($pre_elt = $encoded_list & $mask) == 0) {
+ break;
+ }
array_unshift($decoded_list, $pre_elt);
$encoded_list >>= $num_bits;
}
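For reference, the decoding loop in unpackListModified9 above stops at the first 0 slot, which is why the new Mod9Constants comment warns against encoding 0. Below is a self-contained sketch of that stopping behavior under hypothetical parameters (4 bits per element, 7 elements per word) rather than the real MOD9_NUM_BITS_CODES / MOD9_NUM_ELTS_DECODES tables:

<?php
/* Sketch only: decode one packed word under assumed parameters, not Yioop's
 * actual Mod9 code tables. The low bits hold the last element, so
 * array_unshift rebuilds the original order; a 0 slot ends the sequence,
 * which is why every encoded value must be >= 1.
 */
function decodeWordSketch(int $word, int $num_bits = 4, int $num_elts = 7): array
{
    $mask = (1 << $num_bits) - 1;
    $decoded = [];
    for ($i = 0; $i < $num_elts; $i++) {
        $elt = $word & $mask;
        if ($elt == 0) {        // 0 marks the end of the packed sequence
            break;
        }
        array_unshift($decoded, $elt);
        $word >>= $num_bits;
    }
    return $decoded;
}
// 0x351 packs [3, 5, 1] into three 4-bit slots; decoding recovers the list
var_dump(decodeWordSketch(0x351)); // [3, 5, 1]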
diff --git a/src/library/WebArchive.php b/src/library/WebArchive.php
index 4fecdded2..a7222f0c6 100755
--- a/src/library/WebArchive.php
+++ b/src/library/WebArchive.php
@@ -329,8 +329,12 @@ class WebArchive
if ((!$is_string && fseek($fh, $offset) == 0 ) || ($is_string
&& $offset < $storage_len)) {
for ($i = 0; $i < $num; $i++) {
- if (!$is_string && feof($fh)) {break; }
- if ($is_string && $offset >= $storage_len) {break; }
+ if (!$is_string && feof($fh)) {
+ break;
+ }
+ if ($is_string && $offset >= $storage_len) {
+ break;
+ }
$object = null;
$compressed_len = ($is_string)
? substr($this->storage, $offset, $compressed_int_len)
diff --git a/src/library/WebArchiveBundle.php b/src/library/WebArchiveBundle.php
index 6f0ef6960..f23b4cb1e 100755
--- a/src/library/WebArchiveBundle.php
+++ b/src/library/WebArchiveBundle.php
@@ -262,7 +262,7 @@ class WebArchiveBundle
if (!$archive_name_exists) {
/* always add a dummy record so an offset 0 of a real record
can never be legit. This is just to be on the safe side
- if a changeDocumentOffsets in IndexShard happens not to work
+ if a changeDocumentOffsets in IndexShard happens not to work.
*/
$dummy_pages = [["DUMMY"]];
$this->partition[$index]->addObjects("DUMMY_OFFSET",
@@ -334,7 +334,7 @@ class WebArchiveBundle
$info['NUM_DOCS_PER_PARTITION'] = -1;
return $info;
}
- $info = unserialize(file_get_contents($dir_name."/description.txt"));
+ $info = unserialize(file_get_contents($dir_name . "/description.txt"));
return $info;
}
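The dummy-record comment above describes a sentinel convention: partition offset 0 always belongs to the ["DUMMY"] page, so a stored offset of 0 can never refer to a real record. A tiny illustration of how callers can rely on that convention; the helper name is invented for this sketch:

<?php
/* Hypothetical helper: because WebArchiveBundle writes a dummy record at
 * offset 0, only strictly positive offsets can point at real objects.
 */
function isRealArchiveOffset(int $offset): bool
{
    return $offset > 0; // 0 is reserved by the "DUMMY" record
}
var_dump(isRealArchiveOffset(0));   // false: the dummy record
var_dump(isRealArchiveOffset(512)); // true: could be a real object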
/**
diff --git a/src/library/summarizers/Summarizer.php b/src/library/summarizers/Summarizer.php
index 65020eca5..55fa424a6 100644
--- a/src/library/summarizers/Summarizer.php
+++ b/src/library/summarizers/Summarizer.php
@@ -363,7 +363,9 @@ class Summarizer
sort($summary_indices);
$eos = ($lang == 'hi') ? "।" : "."; //default end of sentence symbol
$summary_scores = [];
- $score_pos = 0;
+ $score_pos = 1; /* Starting offset in docs is always 1, not 0, so it works with
+ modified9 encoding/decoding
+ */
foreach ($summary_indices as $index) {
$sentence = PhraseParser::compressSentence($sentences[$index],
$lang);
diff --git a/tests/UtilityTest.php b/tests/UtilityTest.php
index 9d47d2a4f..226c500dc 100644
--- a/tests/UtilityTest.php
+++ b/tests/UtilityTest.php
@@ -97,17 +97,17 @@ class UtilityTest extends UnitTest
$packed = L\packPosting(33689, $posting_list);
$out_doc_list = L\unpackPosting($packed, $offset, true);
$this->assertEqual($out_doc_list[0], 33689,
- "Doc index from unpack of first word has delta 0 case");
+ "Doc index from unpack of first word has delta[0] case");
$this->assertEqual($out_doc_list[1], $posting_list,
- "Unpack of delta 0 case");
+ "Unpack of delta[0] case");
$offset = 0;
$posting_list = [511, 12000, 24000];
$packed = L\packPosting(33689, $posting_list);
$out_doc_list = L\unpackPosting($packed, $offset, true);
$this->assertEqual($out_doc_list[0], 33689,
- "Doc index from unpack of first word has delta 0 case");
+ "Doc index from unpack of first word has delta[0] case 2");
$this->assertEqual($out_doc_list[1], $posting_list,
- "Unpack of delta 0 case");
+ "Unpack of delta[0] case 2");
$posting_list = [6000, 12000, 24000];
$packed = L\packPosting(100000, $posting_list);
$offset = 0;
@@ -116,6 +116,15 @@ class UtilityTest extends UnitTest
"Bigger Doc index from unpack of long packed posting equal");
$this->assertEqual($out_doc_list[1], $posting_list,
"Bigger Delta unpack of posting equal");
+ $posting_list = [1, 4, 7, 174];
+ $packed = L\packPosting(0, $posting_list);
+ $unpack_int = unpack("N*", $packed);
+ $offset = 0;
+ $out_doc_list = L\unpackPosting($packed, $offset, true);
+ $this->assertEqual($out_doc_list[0], 0,
+ "Doc index from unpack of doc index 0 case");
+ $this->assertEqual($out_doc_list[1], $posting_list,
+ "Unpack of doc index 0 case");
}
/**
* Used to check if the functions to encode decode queue weight are