Tweaks to when whole phrase search used or not, a=chris

Chris Pollett [2019-01-23 00:Jan:rd]

Tweaks to when whole phrase search used or not, a=chris

Filename
src/executables/Fetcher.php
src/library/IndexManager.php
src/library/PhraseParser.php
src/library/media_jobs/FeedsUpdateJob.php
src/models/PhraseModel.php

diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php
index fca344937..9aeee3e8a 100755
--- a/src/executables/Fetcher.php
+++ b/src/executables/Fetcher.php
@@ -1677,7 +1677,8 @@ class Fetcher implements CrawlConstants
                 if (!isset($this->hosts_with_errors[$host])) {
                     $this->hosts_with_errors[$host] = 0;
                 }
-                if ($response_code >= 400 || $response_code < 100) {
+                if (($response_code >= 400 && $response_code != 404) ||
+                    $response_code < 100) {
                     // < 100 will capture failures to connect which are returned
                     // as strings
                     $was_error = true;
diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php
index 1d548d35c..f1ce3a3ce 100644
--- a/src/library/IndexManager.php
+++ b/src/library/IndexManager.php
@@ -184,11 +184,15 @@ class IndexManager implements CrawlConstants
      *     bits of word id to discard
      * @param int $threshold after the number of results exceeds this amount
      *     stop looking for more dictionary entries.
-     * @param int $start_generation
-     * @param int $num_distinct_generations
-     * @param bool $with_remaining_total
-     * @return array sequence of four tuples:
-     *     (index_shard generation, posting_list_offset, length, exact id
+     * @param int $start_generation what generation in the index to start
+     *      finding occurrence of phrase from
+     * @param int $num_distinct_generations from $start_generation how
+     *      many generation to search forward to
+     * @param bool $with_remaining_total whether to total number of
+     *      postings found as well or not
+     * @return array either [total, sequence of four tuples]
+    *       or sequence of four tuples:
+     *      (index_shard generation, posting_list_offset, length, exact id
      *      that match $hash)
      */
     public static function getWordInfo($index_name, $hash, $shift = 0,
@@ -236,9 +240,11 @@ class IndexManager implements CrawlConstants
      * @param string $index_name index to look up term or phrase in
      * @param int $threshold if set and positive then once threshold many
      *     documents are found the search for more documents to add to the
-     *     total is stoppe
-     * @param int $start_generation
-     * @param int $num_distinct_generations
+     *     total is stopped
+     * @param int $start_generation what generation in the index to start
+     *      finding occurrence of phrase from
+     * @param int $num_distinct_generations from $start_generation how
+     *      many generation to search forward to
      * @return int number of documents
      */
     public static function numDocsTerm($term_or_phrase, $index_name,
diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index 66635e1de..d1b49ec1c 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -137,68 +137,78 @@ class PhraseParser
             self::hyphenateEntities($string, $lang);
         }
         $terms = self::stemCharGramSegment($string, $lang);
-        $num = count($terms);
-        if ($index_name == null || $num <= 1 || (class_exists($char_class) &&
-            isset($char_class::$char_gram_len) )) {
+        $num_terms = count($terms);
+        if ($index_name == null || $num_terms <= 1 ||
+            (class_exists($char_class) && isset($char_class::$char_gram_len))) {
             return $terms;
         }
-        if (count($terms) > C\MAX_QUERY_TERMS) {
-            $first_terms = array_slice($terms, 0, C\MAX_QUERY_TERMS);
-            $whole_phrase = implode(" ", $first_terms);
-        } else {
-            $whole_phrase = implode(" ", $terms);
-            $first_terms = $terms;
-        }
-        if ($exact_match) {
-            /* for exact phrase search do not use suffix tree stuff for now */
+        // keep only first C\MAX_QUERY_TERMS many terms
+        if ($num_terms > C\MAX_QUERY_TERMS) {
+            $terms = array_slice($terms, 0, C\MAX_QUERY_TERMS);
+        }
+        $whole_phrase = implode(" ", $terms);
+        if ($exact_match || ($index_name != 'feed' &&
+            IndexManager::getVersion($index_name) == 0)) {
+            /* for exact phrase search do not use suffix tree stuff for now.
+               Also, for old style index before max phrase extraction
+               just return terms
+            */
             return $terms;
         }
         $tokenizer = self::getTokenizer($lang);
+        // query terms are question answer triplet then do no further processing
         if (!empty($tokenizer::$question_token) &&
             stristr($whole_phrase, $tokenizer::$question_token) !== false) {
-            $terms = [$whole_phrase, $terms[0]];
+            return [$whole_phrase];
+        }
+        $terms = self::extractTermsWholePhrase($terms, $index_name,
+            $threshold);
+        return $terms;
+    }
+    /**
+     *
+     */
+    public static function extractTermsWholePhrase($terms, $index_name,
+        $threshold)
+    {
+        $num_terms = count($terms);
+        if ($num_terms <= 1) {
             return $terms;
         }
+        $whole_phrase = implode(" ", $terms);
         $count_whole_phrase = IndexManager::numDocsTerm($whole_phrase,
             $index_name, $threshold);
-        if ($count_whole_phrase >= $threshold
-            || $num > C\PHRASE_THRESHOLD) {
-            $terms = [$whole_phrase, $terms[0]];
+        /*
+           If have more than $threshold (default 10, one page) worth of
+           whole phrase results then use whole phrase for results
+         */
+        if ($count_whole_phrase >= $threshold) {
+            return [$whole_phrase];
+        } else if ($num_terms <= 2) {
             return $terms;
-        } else if ($count_whole_phrase > 0) {
-            foreach ($terms as $term) {
-                $count_term = IndexManager::numDocsTerm($term,
-                    $index_name, 5 * $threshold);
-                if ($count_term > 50 * $count_whole_phrase) {
-                    $terms = [$whole_phrase, $terms[0]];
-                    return $terms;
-                }
-            }
-        } else if ($num > 2) {
-            $start_terms = $first_terms;
-            $last_term = array_pop($start_terms);
-            $start_phrase = implode(" ", $start_terms);
-            $count_start = IndexManager::numDocsTerm($start_phrase,
+        } else {
+            $first_term = array_shift($terms);
+            $extract_terms = self::extractTermsWholePhrase($terms,
                 $index_name, $threshold);
-            if ($count_start >= $threshold) {
-                $terms = [$start_phrase, $last_term, $terms[0]];
-                return $terms;
+            if (count($extract_terms) <= 1) {
+                if ($count_whole_phrase > 0) {
+                    $count_extract_terms = IndexManager::numDocsTerm(
+                        $whole_phrase, $index_name, $threshold);
+                    if ($count_whole_phrase * $threshold >
+                        $count_extract_terms) {
+                        return [$whole_phrase];
+                    }
+                }
+                array_unshift($extract_terms, $first_term);
+                return $extract_terms;
             }
-            $end_terms = $first_terms;
-            $first_term = array_shift($end_terms);
-            $end_phrase = implode(" ", $end_terms);
-            $count_end = IndexManager::numDocsTerm($end_phrase,
+            $last_term = array_pop($extract_terms);
+            array_unshift($extract_terms, $first_term);
+            $rest_terms = self::extractTermsWholePhrase($extract_terms,
                 $index_name, $threshold);
-            if ($count_end >= $threshold) {
-                $terms = [$first_term, $end_phrase];
-                return $terms;
-            }
-        }
-        if ($index_name != 'feed' &&
-            IndexManager::getVersion($index_name) == 0) {
-            return $terms; //old style index before max phrase extraction
+            $rest_terms[] = $last_term;
+            return $rest_terms;
         }
-        return $terms;
     }
     /**
      * Extracts all phrases (sequences of adjacent words) from $string. Does
@@ -315,10 +325,13 @@ class PhraseParser
             }
     }
     /**
+     * Given a string, hyphenates words in the string which appear in
+     * a bloom filter for the given locale as phrases.
+     *
      * @param string& $string a string of words, etc which might involve such
      *      terms
      * @param $lang a language tag to use as part of the canonicalization
-     *     process not used right now
+     *     process
      */
     public static function hyphenateEntities(&$string, $lang = null)
     {
@@ -362,7 +375,7 @@ class PhraseParser
                     $space = " ";
                     $current_entity = "";
                     $last_entity = "";
-                    $lower_last_entity ="";
+                    $lower_last_entity = "";
                     $i = $k;
                     $j = $k - 1;
                 }
diff --git a/src/library/media_jobs/FeedsUpdateJob.php b/src/library/media_jobs/FeedsUpdateJob.php
index 19ec1574f..8c994f357 100644
--- a/src/library/media_jobs/FeedsUpdateJob.php
+++ b/src/library/media_jobs/FeedsUpdateJob.php
@@ -665,7 +665,7 @@ class FeedsUpdateJob extends MediaJob
                     $phrase_string, $lang);
                 $raw_guid = L\unbase64Hash($item["GUID"]);
                 $doc_keys = L\crawlHash($item["LINK"], true) .
-                    $raw_guid."d". substr(L\crawlHash(
+                    $raw_guid . "d". substr(L\crawlHash(
                     UrlParser::getHost($item["LINK"])."/", true), 1);
                 $meta_ids = $this->calculateMetas($lang, $item['PUBDATE'],
                     $source_name, $item["GUID"], $media_category);
diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php
index eebbc2664..3860ec496 100755
--- a/src/models/PhraseModel.php
+++ b/src/models/PhraseModel.php
@@ -579,10 +579,7 @@ class PhraseModel extends ParallelModel
             } else {
                 $new_words =
                     PhraseParser::extractPhrases($phrase_part, $locale_tag,
-                         $index_name);
-                if (isset($new_words[0]) && strpos($new_words[0], " ") > 0) {
-                    array_pop($new_words);
-                }
+                    $index_name);
                 $base_words = array_merge($base_words, $new_words);
             }
             $num_words = count($base_words);

ViewGit