Filename |
---|
src/controllers/CrawlController.php |
src/models/Model.php |
src/models/ParallelModel.php |
src/models/PhraseModel.php |
src/views/SearchView.php |
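At a glance, the diff below moves per-result snippet formatting onto the machines that serve the crawl items: `Model::formatPageResults()` becomes `addEditedPageResults()`, the per-page work is split out into a new `formatSinglePageResult()`, and `getCrawlItems()` / `nonNetworkGetCrawlItems()` / `getSummariesFromOffsets()` now carry `$format_words` and `$description_length` through the network request so each queue server can snippify and bold-face the summaries it returns. The sketch below is a simplified, standalone illustration of that flow, not the actual Yioop classes; the `*Sketch` function names and the plain array keys are assumptions made for the example.

```php
<?php
// Simplified, self-contained sketch (assumed names, not the real Yioop API) of
// the flow this commit introduces: each fetched summary is formatted
// individually, so the formatting can happen on the machine that looked the
// summary up instead of on the server that aggregates results.

/**
 * Format one page summary: strip tags, derive a title from the description
 * when none is present, and bold-face the supplied keywords.
 */
function formatSinglePageResultSketch(array $page, ?array $words,
    int $description_length): array
{
    $page['TITLE'] = strip_tags($page['TITLE'] ?? "");
    $page['DESCRIPTION'] = strip_tags($page['DESCRIPTION'] ?? "");
    if ($page['TITLE'] === "") {
        // no title: fall back to the start of the description, roughly what
        // the real method does with its SNIPPET_TITLE_LENGTH logic
        $page['TITLE'] = mb_substr($page['DESCRIPTION'], 0, 20);
    }
    // keep at most $description_length characters of description
    $page['DESCRIPTION'] = mb_substr($page['DESCRIPTION'], 0,
        $description_length);
    if (!empty($words)) {
        // bold-face each keyword, in the spirit of boldKeywords()
        foreach ($words as $word) {
            $pattern = "/(" . preg_quote($word, "/") . ")/ui";
            $page['TITLE'] = preg_replace($pattern, '<b>$1</b>',
                $page['TITLE']);
            $page['DESCRIPTION'] = preg_replace($pattern, '<b>$1</b>',
                $page['DESCRIPTION']);
        }
    }
    return $page;
}

/**
 * Sketch of the lookup side: after summaries are found, format each one using
 * the keywords and snippet length that travelled with the request (the new
 * $format_words / $description_length parameters of getCrawlItems()).
 */
function getCrawlItemsSketch(array $summaries, ?array $format_words,
    int $description_length): array
{
    if ($format_words !== null && $description_length > 0) {
        foreach ($summaries as $key => $summary) {
            $summaries[$key] = formatSinglePageResultSketch($summary,
                $format_words, $description_length);
        }
    }
    return $summaries;
}

// Example use with a made-up summary
$summaries = [
    "doc1" => ["TITLE" => "", "DESCRIPTION" => "PHP is a popular " .
        "general-purpose scripting language suited to web development."],
];
print_r(getCrawlItemsSketch($summaries, ["php", "web"], 60));
```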
diff --git a/src/controllers/CrawlController.php b/src/controllers/CrawlController.php index 8c7648989..91d5e735c 100644 --- a/src/controllers/CrawlController.php +++ b/src/controllers/CrawlController.php @@ -237,7 +237,7 @@ class CrawlController extends Controller implements CrawlConstants $num = $this->clean($_REQUEST["num"], "int"); $i = $this->clean($_REQUEST["i"], "int"); $crawl_model->current_machine = $i; - list($lookups, $exclude_fields) = + list($lookups, $exclude_fields, $format_words, $description_length) = unserialize(L\webdecode($_REQUEST["arg"])); $our_lookups = []; foreach ($lookups as $lookup => $lookup_info) { @@ -260,7 +260,7 @@ class CrawlController extends Controller implements CrawlConstants } } $items = $crawl_model->getCrawlItems($our_lookups, null, - $exclude_fields); + $exclude_fields, $format_words, $description_length); $this->web_site->header("Content-Type: application/octet-stream"); $items["ELAPSED_TIME"] = L\changeInMicrotime($start_time); $items = gzdeflate(serialize($items)); diff --git a/src/models/Model.php b/src/models/Model.php index a2f00bfaf..f873655ce 100755 --- a/src/models/Model.php +++ b/src/models/Model.php @@ -49,7 +49,6 @@ require_once __DIR__."/../library/Utility.php"; */ class Model implements CrawlConstants { - const SCORE_PRECISION = 4; const SNIPPET_TITLE_LENGTH = 20; const MAX_SNIPPET_TITLE_LENGTH = 20; const SNIPPET_LENGTH_LEFT = 20; @@ -169,9 +168,8 @@ class Model implements CrawlConstants return file_put_contents($filename, $data); } /** - * Given an array page summaries, for each summary extracts snippets which - * are related to a set of search words. For each snippet, bold faces the - * search terms, and then creates a new summary array. + * Given an array page summaries, for each summaru check if url corresponds + * to a search result that was human edited, if so, replace and format it. * * @param array $results web pages summaries (these in turn are * arrays!) @@ -179,7 +177,7 @@ class Model implements CrawlConstants * @param int $description_length length of the description * @return array summaries which have been snippified and bold faced */ - public function formatPageResults($results, $words = null, + public function addEditedPageResults($results, $words = null, $description_length = self::DEFAULT_DESCRIPTION_LENGTH) { if (isset($results['PAGES'])) { @@ -214,60 +212,76 @@ class Model implements CrawlConstants $page[$field] = $summary[$field]; } } + $page = $this->formatSinglePageResult($page, $words, + $description_length); + $pages[$i] = $page; } } - if (empty($page[self::TITLE])) { - $page[self::TITLE] = ""; - } - $page[self::TITLE] = strip_tags($page[self::TITLE]); - $page[self::DESCRIPTION] = strip_tags( - preg_replace("/\<\s+([a-zA-Z])/", '<$1', - $page[self::DESCRIPTION])); - if (strlen($page[self::TITLE]) == 0) { - $offset = min(mb_strlen($page[self::DESCRIPTION]), - self::SNIPPET_TITLE_LENGTH); - $end_title = mb_strpos($page[self::DESCRIPTION], " ", $offset); - $ellipsis = ""; - if ($end_title > self::SNIPPET_TITLE_LENGTH) { - $ellipsis = "..."; - if ($end_title > self::MAX_SNIPPET_TITLE_LENGTH) { - $end_title = self::MAX_SNIPPET_TITLE_LENGTH; - } - } - $page[self::TITLE] = mb_substr($page[self::DESCRIPTION], 0, - $end_title) . $ellipsis; - //still no text revert to url - if (strlen($page[self::TITLE]) == 0 && - isset($page[self::URL])) { - $page[self::TITLE] = $page[self::URL]; + } + $output['TOTAL_ROWS'] = $results['TOTAL_ROWS']; + $output['PAGES'] = ($deleted_a_page) ? 
$pages : array_values($pages); + return $output; + } + /** + * Given a page summary, extracts snippets which + * are related to a set of search words. For each snippet, bold faces the + * search terms, and then creates a new summary array. + * + * @param array $page a single search result summary + * @param array $words keywords (typically what was searched on) + * @param int $description_length length of the description + * @return array $page which has been snippified and bold faced + */ + public function formatSinglePageResult($page, $words = null, + $description_length = self::DEFAULT_DESCRIPTION_LENGTH) + { + if (empty($page[self::TITLE])) { + $page[self::TITLE] = ""; + } + $page[self::TITLE] = strip_tags($page[self::TITLE]); + $page[self::DESCRIPTION] = strip_tags( + preg_replace("/\<\s+([a-zA-Z])/", '<$1', + $page[self::DESCRIPTION])); + if (strlen($page[self::TITLE]) == 0) { + $offset = min(mb_strlen($page[self::DESCRIPTION]), + self::SNIPPET_TITLE_LENGTH); + $end_title = mb_strpos($page[self::DESCRIPTION], " ", $offset); + $ellipsis = ""; + if ($end_title > self::SNIPPET_TITLE_LENGTH) { + $ellipsis = "..."; + if ($end_title > self::MAX_SNIPPET_TITLE_LENGTH) { + $end_title = self::MAX_SNIPPET_TITLE_LENGTH; } } - // do a little cleaning on text - if ($words != null) { - $page[self::TITLE] = - $this->boldKeywords($page[self::TITLE], $words); - if (!isset($page[self::IS_FEED])) { - $page[self::DESCRIPTION] = - $this->getSnippets($page[self::DESCRIPTION], - $words, $description_length); - } + $page[self::TITLE] = mb_substr($page[self::DESCRIPTION], 0, + $end_title) . $ellipsis; + //still no text revert to url + if (strlen($page[self::TITLE]) == 0 && + isset($page[self::URL])) { + $page[self::TITLE] = $page[self::URL]; + } + } + // do a little cleaning on text + if ($words != null) { + $page[self::TITLE] = + $this->boldKeywords($page[self::TITLE], $words); + if (!isset($page[self::IS_FEED])) { $page[self::DESCRIPTION] = - $this->boldKeywords($page[self::DESCRIPTION], $words); - } else { - $page[self::DESCRIPTION] = mb_substr($page[self::DESCRIPTION], - 0, $description_length); + $this->getSnippets($page[self::DESCRIPTION], + $words, $description_length); } - $pre_description = preg_replace("/\p{C}+|^[^\p{L}]+/u", "", - $page[self::DESCRIPTION]); - $page[self::DESCRIPTION] = (substr($pre_description,0,2) == "b>") ? - "<" . $pre_description : $pre_description; - $page[self::SCORE] = mb_substr($page[self::SCORE], 0, - self::SCORE_PRECISION); - $pages[$i] = $page; + $page[self::DESCRIPTION] = + $this->boldKeywords($page[self::DESCRIPTION], $words); + } else { + $page[self::DESCRIPTION] = mb_substr($page[self::DESCRIPTION], + 0, $description_length); } - $output['TOTAL_ROWS'] = $results['TOTAL_ROWS']; - $output['PAGES'] = ($deleted_a_page) ? $pages : array_values($pages); - return $output; + $page[self::TITLE] = trim($page[self::TITLE], " ."); + $pre_description = preg_replace("/\p{C}+|^[^\p{L}]+/u", "", + $page[self::DESCRIPTION]); + $page[self::DESCRIPTION] = (substr($pre_description, 0, 2) == "b>") ? + "<" . 
$pre_description : $pre_description; + return $page; } /** * Given a string, extracts a snippets of text related to a given set of @@ -279,16 +293,30 @@ class Model implements CrawlConstants * @param string $text haystack to extract snippet from * @param array $words keywords used to make look in haystack * @param string $description_length length of the description desired - * @param bool $words_change getSnippets might be called many times on - * the same search page with the same $words, if true then the - * preprocessing of $words is avoided and cached versions are used * @return string a concatenation of the extracted snippets of each word */ - public function getSnippets($text, $words, $description_length, - $words_change = false) + public function getSnippets($text, $words, $description_length) { static $search_words = []; + static $last_words = ""; static $word_regex = ""; + if (mb_strlen($text) < $description_length) { + return $text; + } + if (empty($words)) { + $snippet_string = mb_substr($text, 0, $description_length); + $rpos = strrpos($snippet_string, " "); + if ($rpos) { + $snippet_string = mb_substr($snippet_string, 0, $rpos); + } + return $snippet_string; + } + $word_string = implode(" ", $words); + $words_change = false; + if ($word_string != $last_words) { + $words_change = true; + $last_words = $word_string; + } $start_regex = "/"; $left = self::SNIPPET_LENGTH_LEFT; $left3 = $left - 3; @@ -297,16 +325,11 @@ class Model implements CrawlConstants $start_regex2 = "/\b(\w{3}.{0,$left3})?(?:(?:"; $end_regex = "/ui"; $end_regex2 = ").{0,$right}\b)+/ui"; - if (mb_strlen($text) < $description_length) { - return $text; - } $ellipsis = ""; if ($words_change || empty($search_words)) { - $search_words = []; - foreach ($words as $word) { - $search_words = array_merge($search_words, explode(" ", $word)); - } - $search_words = array_filter(array_unique($search_words)); + // orginal list of words might have had space separated phrases; + $search_words = array_filter(array_unique( + explode(" ", $word_string))); $word_regex = ""; $delim = ""; foreach ($search_words as $word) { @@ -322,41 +345,40 @@ class Model implements CrawlConstants $len = mb_strlen($text_source); $offset = 0; if ($len < self::MIN_SNIPPET_LENGTH) { - if (preg_match($start_regex . $word_regex. + if (preg_match($start_regex . $word_regex . $end_regex, $text_source, $match)) { if (stristr($snippet_string, $text_source) === false) { - $snippet_string .= $ellipsis. $text_source; + $snippet_string .= $ellipsis . $text_source; $ellipsis = " ... "; - if (mb_strlen($snippet_string) >= $description_length) { - break; - } } } - continue; - } - $word_locations = []; - preg_match_all($start_regex2 . $word_regex . $end_regex2, - $text_source, $matches); - if (isset($matches[0])) { - $seen_match = []; - foreach ($matches[0] as $match) { - if ($match >= $description_length) { - $match = mb_substr($match, 0, $description_length); - $rpos = strrpos($match, " "); - if ($rpos) { - $match = mb_substr($match, 0, $rpos); - } - } - $match = trim($match, "."); - if (stristr($snippet_string, $match) === false) { - $snippet_string .= $ellipsis. $match; - $ellipsis = " ... "; - if (mb_strlen($snippet_string) >= $description_length) { - break; + } else { + preg_match_all($start_regex2 . $word_regex . $end_regex2, + $text_source, $matches); + if (isset($matches[0])) { + $seen_match = []; + foreach ($matches[0] as $match) { + $match = trim($match, "."); + if (stristr($snippet_string, $match) === false) { + $snippet_string .= $ellipsis. 
$match; + $ellipsis = " ... "; + if (mb_strlen($snippet_string) >= + $description_length) { + break; + } } } } } + if (mb_strlen($snippet_string) >= $description_length) { + $snippet_string = mb_substr($snippet_string, 0, + $description_length); + $rpos = strrpos($snippet_string, " "); + if ($rpos) { + $snippet_string = mb_substr($snippet_string, 0, $rpos); + } + break; + } } return $snippet_string; } @@ -383,7 +405,7 @@ class Model implements CrawlConstants /** * Gets a list of all DBMS that work with the search engine * - * @return array Names of availabledatasources + * @return array Names of available data sources */ public function getDbmsList() { diff --git a/src/models/ParallelModel.php b/src/models/ParallelModel.php index 1b97675d4..51269b15d 100755 --- a/src/models/ParallelModel.php +++ b/src/models/ParallelModel.php @@ -110,17 +110,22 @@ class ParallelModel extends Model * the crawlItem but which should be excluded from the result. * This will make the result smaller and so hopefully faster to * transmit + * @param array $format_words words which should be highlighted in + * search snippets returned + * @param int $description_length length of snippets to be returned + * for each search result * @return array of summary data for the matching documents */ public function getCrawlItems($lookups, $machine_urls = null, - $exclude_fields = []) + $exclude_fields = [], $format_words = null, + $description_length = self::DEFAULT_DESCRIPTION_LENGTH) { if (!empty($machine_urls) && !$this->isSingleLocalhost($machine_urls)) { $summaries = $this->networkGetCrawlItems($lookups, $machine_urls, - $exclude_fields); + $exclude_fields, $format_words, $description_length); } else { $summaries = $this->nonNetworkGetCrawlItems($lookups, - $exclude_fields); + $exclude_fields, $format_words, $description_length); } return $summaries; } @@ -129,7 +134,7 @@ class ParallelModel extends Model * by their url, or by group of 5-tuples of the form * (machine, key, index, generation, offset). This makes an execMachines * call to make a network request to the CrawlController's on each machine - * which in turn calls getCrawlItems (and thence nonNetworkGetCrawlItems) + * which in turn calls getCrawlItems (and thence nonNetworkGetCrawlItems) * on each machine. The results are then sent back to networkGetCrawlItems * and aggregated. * @@ -139,10 +144,15 @@ class ParallelModel extends Model * the crawlItem but which should be excluded from the result. * This will make the result smaller and so hopefully faster to * transmit + * @param array $format_words words which should be highlighted in + * search snippets returned + * @param int $description_length length of snippets to be returned + * for each search result * @return array of summary data for the matching documents */ public function networkGetCrawlItems($lookups, $machine_urls, - $exclude_fields = []) + $exclude_fields = [], $format_words = null, $description_length = + self::DEFAULT_DESCRIPTION_LENGTH) { //Set-up network request $machines = []; @@ -169,7 +179,8 @@ class ParallelModel extends Model } //Make request $page_set = $this->execMachines("getCrawlItems", - $machines, serialize([$lookups, $exclude_fields]), $num_machines); + $machines, serialize([$lookups, $exclude_fields, + $format_words, $description_length]), $num_machines); //Aggregate results $summaries = []; $elapsed_times = []; @@ -239,9 +250,15 @@ class ParallelModel extends Model * the crawlItem but which should be excluded from the result. 
* This will make the result smaller and so hopefully faster to * transmit + * @param array $format_words words which should be highlighted in + * search snippets returned + * @param int $description_length length of snippets to be returned + * for each search result * @return array of summary data for the matching documents */ - public function nonNetworkGetCrawlItems($lookups, $exclude_fields = []) + public function nonNetworkGetCrawlItems($lookups, $exclude_fields = [], + $format_words = null, $description_length = + self::DEFAULT_DESCRIPTION_LENGTH) { $summary_offset = null; $generation = null; @@ -377,6 +394,13 @@ class ParallelModel extends Model unset($summaries[$key][$exclude_field]); } } + if ($format_words !== null && count($summaries) > 0 && + $description_length > 0) { + foreach ($summaries as $key => $summary) { + $summaries[$key] = $this->formatSinglePageResult($summary, + $format_words, $description_length); + } + } return $summaries; } /** @@ -409,7 +433,7 @@ class ParallelModel extends Model } $num_generations = $index_archive->generation_info['ACTIVE']; $hash_key = ($is_key) ? L\crawlHashWord($url_or_key, true) : - L\crawlHashWord("info:".$url_or_key, true); + L\crawlHashWord("info:" . $url_or_key, true); $info = IndexManager::getWordInfo($index_name, $hash_key, 0, 1); if (!isset($info[0][4])) { return false; diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index 2fa807312..f9f5158b6 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -377,6 +377,13 @@ class PhraseModel extends ParallelModel } } } + if ($format) { + if (count($format_words) == 0) { + $format_words = null; + } + } else { + $format_words = null; + } if (C\QUERY_STATISTICS) { $this->query_info['QUERY'] .= "$in2<b>Presentation Parse time</b>: " . @@ -390,7 +397,7 @@ class PhraseModel extends ParallelModel $out_results = $this->getSummariesByHash($word_structs, $low, $phrase_num, $filter, $use_cache_if_allowed, $raw, $queue_servers, $phrase, $save_timestamp_name, - $limit_feeds); + $limit_feeds, $format_words); if (isset($out_results['PAGES']) && count($out_results['PAGES']) != 0) { $out_count = 0; @@ -465,38 +472,15 @@ class PhraseModel extends ParallelModel } elseif (isset($results['PAGES'])) { $results['TOTAL_ROWS'] = count($results['PAGES']); } - if ($format) { - if (count($format_words) == 0) { - $format_words = null; - } - } else { - $format_words = null; - } - $description_length = self::DEFAULT_DESCRIPTION_LENGTH; - /* additional meta word come from indexing plugins which might need - longer description lengths, say for recipes - */ - if (isset($this->additional_meta_words) && - is_array($this->additional_meta_words)) { - foreach ($this->additional_meta_words as $meta_word => $length) { - $pattern = "/$meta_word/"; - if (preg_match($pattern, $input_phrase)) { - $description_length = $length; - break; // only match the first found - } - } - } if ($raw == 0 && isset($results['TOTAL_ROWS']) && $results['TOTAL_ROWS'] > 0) { - $output = $this->formatPageResults($results, $format_words, + $results = $this->addEditedPageResults($results, $format_words, $description_length); if (!empty($answer_score_map)) { arsort($answer_score_map); reset($answer_score_map); - $output['BEST_ANSWER'] = key($answer_score_map); + $results['BEST_ANSWER'] = key($answer_score_map); } - } else { - $output = $results; } if (C\QUERY_STATISTICS) { $this->query_info['QUERY'] .= "<b>Format Time</b>: ". 
@@ -506,7 +490,7 @@ class PhraseModel extends ParallelModel $this->db->total_time += $this->query_info['ELAPSED_TIME']; $this->db->query_log[] = $this->query_info; } - return $output; + return $results; } /** * Parses from a string phrase representing a conjunctive query, a struct @@ -1045,13 +1029,14 @@ class PhraseModel extends ParallelModel * docs after $save_timestamp 's previous iterate position. * @param bool $limit_feeds if true the number of feed shard items to * allow in search results is limited to WordIterator::LIMIT_FEEDS_COUNT - * + * @param array $format_words words which should be highlighted in + * search snippets returned * @return array document summaries */ public function getSummariesByHash($word_structs, $limit, $num, &$filter, $use_cache_if_allowed = true, $raw = 0, $queue_servers = [], $original_query = "", $save_timestamp_name = "", - $limit_feeds = true) + $limit_feeds = true, $format_words = null) { $indent= " "; $in2 = $indent . $indent; @@ -1314,10 +1299,26 @@ class PhraseModel extends ParallelModel $out_pages = []; $cur_limit = $start_slice; $with_qa = (preg_match("/\bqqq\b/i", $original_query)) ? true : false; + // now calculate snippet length + $description_length = self::DEFAULT_DESCRIPTION_LENGTH; + /* additional meta word come from indexing plugins which might need + longer description lengths, say for recipes + */ + if (isset($this->additional_meta_words) && + is_array($this->additional_meta_words)) { + foreach ($this->additional_meta_words as $meta_word => $length) { + $pattern = "/$meta_word/"; + if (preg_match($pattern, $original_query)) { + $description_length = $length; + break; // only match the first found + } + } + } while (count($out_pages) < $to_get_count && $get_pages) { $out_pages = array_merge($out_pages, $this->getSummariesFromOffsets($get_pages, $queue_servers, - $raw, $groups_with_docs, $with_qa)); + $raw, $groups_with_docs, $with_qa, $format_words, + $description_length)); if ($save_timestamp_name != "") { break; } @@ -1383,11 +1384,24 @@ class PhraseModel extends ParallelModel * contain at least one doc as opposed to a groups with only links * @param bool $with_question_answer_info whether question answer info * in summaries needs to be returned + * @param array $format_words words which should be highlighted in + * search snippets returned + * @param int $description_length length of snippets to be returned + * for each search result * @return array pages with summaries added */ - public function getSummariesFromOffsets(&$pages, &$queue_servers, $raw, - $groups_with_docs, $with_question_answer_info) + public function getSummariesFromOffsets(&$pages, &$queue_servers, + $raw, $groups_with_docs, $with_question_answer_info, + $format_words = null, $description_length = + self::DEFAULT_DESCRIPTION_LENGTH) { + if ($raw != 0) { + $format_words = null; + } else { + if ($format_words == null) { + $format_words = []; + } + } $lookups = []; $summary_exclude_fields = [self::HEADER, self::PAGE, self::LINKS, self::DESCRIPTION_SCORES]; @@ -1429,7 +1443,7 @@ class PhraseModel extends ParallelModel /* look up items (items we have a link summary for, but not doc summary)*/ $summaries = $this->getCrawlItems($lookups, $lookup_queue_servers, - $summary_exclude_fields); + $summary_exclude_fields, $format_words, $description_length); $lookups = []; // link summaries we want to remember in case don't have doc summary $link_summaries = []; @@ -1459,7 +1473,7 @@ class PhraseModel extends ParallelModel } // lookup redirects $loc_summaries = 
$this->getCrawlItems($lookups, $lookup_queue_servers, - $summary_exclude_fields); + $summary_exclude_fields, $format_words, $description_length); // delete summaries we found from $link_summaries if (is_array($loc_summaries)) { $loc_hashes = array_keys($loc_summaries); diff --git a/src/views/SearchView.php b/src/views/SearchView.php index 308df2155..d5ab36eee 100755 --- a/src/views/SearchView.php +++ b/src/views/SearchView.php @@ -52,6 +52,10 @@ class SearchView extends View implements CrawlConstants * Represent extension of Git urls */ const GIT_EXTENSION = ".git"; + /** + * Number of decimals for search result scores + */ + const SCORE_PRECISION = 4; /** * Draws the main landing pages as well as search result pages * @@ -440,8 +444,8 @@ class SearchView extends View implements CrawlConstants e($label . ":" . number_format($score, 2) . "\n"); } } - ?>" ><?=tl('search_view_score', $page[self::SCORE]) ?></span> - <?php + ?>" ><?=tl('search_view_score',number_format($page[self::SCORE], + self::SCORE_PRECISION))?></span><?php } ?> </p>
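The revised `getSnippets()` also drops its `$words_change` parameter: it compares the imploded keyword list against a static copy of the last one it saw and rebuilds its keyword regex only when that string changes, and it returns early for texts shorter than the requested length (or with a plain word-boundary truncation when no keywords were given). Below is a minimal standalone sketch of that caching idea, with a simplified context regex and illustrative names rather than the real implementation.

```php
<?php
// Sketch (assumed names) of the static-cache approach in the new getSnippets()
function snippetSketch(string $text, array $words,
    int $description_length): string
{
    static $last_words = "";
    static $word_regex = "";
    // short texts come back untouched, as in the revised method
    if (mb_strlen($text) < $description_length) {
        return $text;
    }
    if (empty($words)) {
        // no keywords: plain truncation at the last space, as the new code does
        $snippet = mb_substr($text, 0, $description_length);
        $rpos = strrpos($snippet, " ");
        return $rpos ? mb_substr($snippet, 0, $rpos) : $snippet;
    }
    $word_string = implode(" ", $words);
    if ($word_string !== $last_words || $word_regex === "") {
        // rebuild the keyword alternation only when the query actually changed
        $last_words = $word_string;
        $search_words = array_filter(array_unique(explode(" ", $word_string)));
        $word_regex = implode("|", array_map(function ($w) {
            return preg_quote($w, "/");
        }, $search_words));
    }
    // collect up to ~40 characters of context around each keyword occurrence
    $snippet_string = "";
    $ellipsis = "";
    if (preg_match_all("/.{0,40}(?:" . $word_regex . ").{0,40}/ui", $text,
        $matches)) {
        foreach ($matches[0] as $match) {
            $match = trim($match, " .");
            if (stripos($snippet_string, $match) === false) {
                $snippet_string .= $ellipsis . $match;
                $ellipsis = " ... ";
            }
            if (mb_strlen($snippet_string) >= $description_length) {
                break;
            }
        }
    }
    return mb_substr($snippet_string, 0, $description_length);
}

// Example: the regex is built on the first call for "php web", then reused
echo snippetSketch("PHP is a widely used scripting language for building web "
    . "applications and dynamic web pages on many different platforms.",
    ["php", "web"], 60), "\n";
```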
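Finally, `SCORE_PRECISION` moves from `Model` into `SearchView`, and the score display switches from string truncation to `number_format()`. A quick illustration with assumed values of what that changes for a long score:

```php
<?php
// Old Model behavior kept the first SCORE_PRECISION characters of the score
// string; the new SearchView code renders SCORE_PRECISION decimal places, so
// the value is rounded rather than chopped mid-digit. Example score is made up.
const SCORE_PRECISION = 4;
$score = 12.987654;
echo mb_substr((string) $score, 0, SCORE_PRECISION), "\n"; // "12.9"    (old truncation)
echo number_format($score, SCORE_PRECISION), "\n";         // "12.9877" (new display)
```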