Fix issues with entity extraction in PhraseParser, fix PHP Other Error for plugins

Chris Pollett [2023-12-06 07h]
Fix issues with entity extraction in PhraseParser, fix PHP Other Error for plugins
Filename
src/executables/QueueServer.php
src/library/PhraseParser.php
tests/PhraseParserTest.php
diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php
index 7853a5988..4b8bd7b04 100644
--- a/src/executables/QueueServer.php
+++ b/src/executables/QueueServer.php
@@ -1213,18 +1213,16 @@ class QueueServer implements CrawlConstants
                 $plugin_instance_name =
                     lcfirst($plugin)."Plugin";
                 $plugin_name = C\NS_PLUGINS . $plugin . "Plugin";
-                $this->$plugin_instance_name =
-                    new $plugin_name();
+                $plugin_instance = new $plugin_name();
                 if (method_exists($plugin_name, "setConfiguration") &&
                     isset($this->indexing_plugins_data[$plugin])) {
-                    $this->$plugin_instance_name->setConfiguration(
+                    $plugin_instance->setConfiguration(
                         $this->indexing_plugins_data[$plugin]);
                 }
-                if ($this->$plugin_instance_name) {
+                if ($plugin_instance) {
                     L\crawlLog(
                         "... executing $plugin_instance_name");
-                    $this->$plugin_instance_name->
-                        postProcessing($this->crawl_time);
+                    $plugin_instance->postProcessing($this->crawl_time);
                 }
             }
         }
diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php
index ff9444def..156049c32 100755
--- a/src/library/PhraseParser.php
+++ b/src/library/PhraseParser.php
@@ -424,25 +424,36 @@ class PhraseParser
         if (empty($terms)) {
             return [];
         }
-        $t = 1; /*first position in doc is 1 as will encode with modified9
-             which requires positive numbers
+        $t = 1; /*first position in doc is 1 as some encoding schemes used
+           require positive numbers
         */
         if (strpos($string ?? "", "-") === false) {
             foreach ($terms as $term) {
-                $pos_lists[$term][] = $t++;
+                if (!empty($term)) {
+                    $pos_lists[$term][] = $t++;
+                }
             }
         } else {
             // add all single terms in entity
             foreach ($terms as $term) {
+                if (empty($term) || $term == "-") {
+                    continue;
+                }
                 $pos_lists[$term][] = $t;
                 /* this is to allow for searching by entities and parts
                    of entities
                  */
-
-                $term_parts = explode("-", $term ?? "");
-                array_shift($term_parts);
-                foreach ($term_parts as $part) {
-                    $pos_lists[$part][] = $t;
+                $entity = strtr($term, "-", " ");
+                if ($entity != $term) {
+                    $entity_terms = self::stemCharGramSegment($entity, $lang);
+                    $old_t = $t;
+                    foreach ($entity_terms as $entity_term) {
+                        if (!empty($entity_term)) {
+                            $pos_lists[$entity_term][] = $t;
+                            $t++;
+                        }
+                    }
+                    $t--;
                 }
                 $t++;
             }
diff --git a/tests/PhraseParserTest.php b/tests/PhraseParserTest.php
index 7c1e4ca08..84acf4f19 100644
--- a/tests/PhraseParserTest.php
+++ b/tests/PhraseParserTest.php
@@ -83,6 +83,10 @@ EOD;
         $words = array_keys($word_lists);
         $this->assertTrue(in_array("prime-minist", $words),
             "Extract Entity 1");
+        $this->assertTrue(in_array("prime", $words),
+            "Extract Entity PART 1");
+        $this->assertTrue(in_array("minist", $words),
+            "Extract Entity PART 2");
         $this->assertTrue(in_array("deep", $words), "Unigrams still present 1");
         $this->assertTrue(in_array("space", $words),
             "Unigrams still present 2");
ViewGit