diff --git a/src/configs/Config.php b/src/configs/Config.php index 2097fc733..2d4cf0cae 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -626,6 +626,19 @@ nsconddefine('CACHE_ROBOT_TXT_TIME', ONE_DAY); * files are kept on disk, but might be slower to access if not in cache. */ nsconddefine('SIZE_ROBOT_TXT_CACHE', 1000); +/** + * Robots.txt behavior setting: always follow a site's robots.txt instructions + */ +nsdefine('ALWAYS_FOLLOW_ROBOTS', 1); +/** + * Robots.txt behavior setting: allow a host's landing page to be crawled, but otherwise respect robots.txt + */ +nsdefine('ALLOW_LANDING_ROBOTS', 2); +/** + * Robots.txt behavior setting: completely ignore robots.txt files + */ +nsdefine('IGNORE_ROBOTS', 3); + /** * Whether the scheduler should track ETag and Expires headers. * If you want to turn this off set the variable to false in diff --git a/src/configs/PublicHelpPages.php b/src/configs/PublicHelpPages.php index 1dcbde013..8e263f35d 100644 --- a/src/configs/PublicHelpPages.php +++ b/src/configs/PublicHelpPages.php @@ -2074,6 +2074,33 @@ Finally, the '''Number of Fetchers''' drop down al {{right|[[https://www.seekquarry.com/?c=static&p=Documentation#GUI%20for%20Managing%20Machines%20and%20Servers|Learn More..]]}} EOD; +$help_pages["en-US"]["Max_Depth"] = <<< EOD +page_type=standard + +page_alias= + +page_border=solid-border + +toc=true + +title= + +author= + +robots= + +description= + +alternative_path= + +page_header= + +page_footer= + +sort=aname + +END_HEAD_VARSThe '''Max Depth''' dropdown is used to limit what urls are allowed to be crawled by the number of hops they are from a seed site. For example, if the Max Depth was set to 2, then seed sites would be crawled, sites linked to seed sites would be crawled, and sites linked to sites linked to seed sites would be crawled. A depth 0 crawl only crawls the seed sites. +EOD; $help_pages["en-US"]["Media_Sources"] = <<< EOD page_type=standard @@ -2500,6 +2527,70 @@ their password on their own; * '''Email Link Password Recovery''', a user can specify their login and get emailed a password change link; * '''Email Link and Check Questions Recovery''', a user can specify their login and get emailed a password change link. 
The password change page requires the user correctly answers previously provided recovery questions. EOD; +$help_pages["en-US"]["Repeat_Type"] = <<< EOD +page_type=standard + +page_alias= + +page_border=solid-border + +toc=true + +title= + +author= + +robots= + +description= + +alternative_path= + +page_header= + +page_footer= + +sort=aname + +END_HEAD_VARSThe '''Repeat Type''' dropdown controls whether a crawl is a repeating crawl or not, and if so, what its repeat duration is. A non-repeating crawl has one index and crawling continues adding to this index until all allowed urls have been crawled or until the administrator stops the crawl. In a repeating crawl one has a double index that consists of an index to serve search results from and an index to crawl into. Once the repeat time has been exceeded the index that was being crawled into becomes the index to serve results from, the previous search index is reset to empty and is then used to crawl into for the next repeat time amount of time. The '''Two Minute''' repeat type can be used to experiment with this behavior. +EOD; +$help_pages["en-US"]["Robots_Behaviors"] = <<< EOD +page_type=standard + +page_alias= + +page_border=solid-border + +toc=true + +title= + +author= + +robots= + +description= + +alternative_path= + +page_header= + +page_footer= + +sort=aname + +END_HEAD_VARSThe '''Robots Behaviors''' dropdown controls the degree to which your Yioop crawler respects '''robots.txt''' files. A '''robots.txt''' is a file placed by a site operator in the document root of their web site. I.e., it would typically have a url like: +https://some_host_name/robots.txt<br> +or<br> +http://some_host_name/robots.txt. +It is used to specify the files that a particular kind of crawler is allowed to download from a site and at what rate. So for example it might have instructions for how the GoogleBot is allowed to crawl the site, how the BingBot is allowed to crawl the site, etc. 
The available options are: +* '''Always Follow''' which always follows to the best of Yioop's abilities the robots.txt instructions. +* '''Allow Landing Page Crawl''' which allows Yioop to download urls of the form +https://some_host_name/<br> +or<br> +http://some_host_name/ but otherwise respects the robots.txt file. +* '''Ignore''' which allows Yioop to completely ignore the robots.txt file. This option should only be used at your own risk. There might be some use cases such as where you want to crawl part of a site that you yourself own, but where you don't have control of the robots.txt. For the most part, you should not use this option. +EOD; $help_pages["en-US"]["Scrapers"] = <<< EOD page_type=standard @@ -2647,27 +2738,31 @@ robots= description= +alternative_path= + page_header= page_footer= -END_HEAD_VARS'''Seed Sites''' are a list of urls that Yioop should start a crawl from. - -<br /> - -If under Server Settings : Account Registration user's are allowed to register for Yioop accounts at some -level other than completely disabled, then the Tools: Suggest a Url form will be enabled. URLs suggested through this form can be added to the seed sites by clicking the '''Add User Suggest data''' link. These URLS will appear at the end of the seeds sites and will appear with a timestamp of when they added before them. Adding this data to the seed sites clears the list of suggested sites from where it is temporarily stored before being added. - -<br /> - -Some site's robot.txt forbid crawl of the site. If you would like to create a placeholder page for such a site so that a link to that site might still appear in the index, but so that the site itself is not crawled by the crawler, you can use a syntax like: - -<nowiki> -http://www.facebool.com/###! -Facebook###! -A%20famous%20social%20media%20site -</nowiki> +sort=aname +END_HEAD_VARS'''Seed Sites''' are a list of urls that Yioop should start a crawl from. 
+ +<br /> + +If under Server Settings : Account Registration users are allowed to register for Yioop accounts at some +level other than completely disabled, then the Tools: Suggest a Url form will be enabled. URLs suggested through this form can be added to the seed sites by clicking the '''Add User Suggest data''' link. These URLS will appear at the end of the seed sites and will appear with a timestamp of when they were added before them. Adding this data to the seed sites clears the list of suggested sites from where it is temporarily stored before being added. + +<br /> + +Some sites' robots.txt files forbid crawling of the site. If you have your crawler configured to always follow the robots.txt file, but would like to create a placeholder page for such a forbidden site so that a link to that site might still appear in the index, yet so that the site itself is not crawled by the crawler, you can use a syntax like: + +<nowiki> +http://www.facebook.com/###! +Facebook###! +A%20famous%20social%20media%20site +</nowiki> + This should all be on one line. Here ###! is used a separator and the format is url##!title###!description. EOD; $help_pages["en-US"]["Start_Crawl"] = <<< EOD diff --git a/src/configs/default_crawl.ini b/src/configs/default_crawl.ini index 8dc1270fa..5211d447a 100644 --- a/src/configs/default_crawl.ini +++ b/src/configs/default_crawl.ini @@ -24,6 +24,7 @@ crawl_order = 'ad'; max_depth = '-1'; repeat_type = '-1'; +robots_txt = '1'; channel = '0'; summarizer_option = 'dk'; crawl_type = 'ax'; diff --git a/src/controllers/SettingsController.php b/src/controllers/SettingsController.php index 27180fd01..5e65710d7 100755 --- a/src/controllers/SettingsController.php +++ b/src/controllers/SettingsController.php @@ -102,13 +102,22 @@ class SettingsController extends Controller } else { $data['OPEN_IN_TABS'] = false; } + if ($token_okay && isset($_REQUEST['perpage'])) { + $_SESSION['SAFE_SEARCH'] = (isset($_REQUEST['safe_search'])) ? 
+ true : false; + } + if (isset($_SESSION['SAFE_SEARCH'])) { + $data['SAFE_SEARCH'] = $_SESSION['SAFE_SEARCH']; + } else { + $data['SAFE_SEARCH'] = true; + } $machine_urls = $this->model("machine")->getQueueServerUrls(); $crawls = $crawl_model->getCrawlList(false, true, $machine_urls, true); $data['CRAWLS'] = []; foreach ($crawls as $crawl) { $data['CRAWLS'][$crawl['CRAWL_TIME']] = $crawl['DESCRIPTION']. - " ... ".$crawl['COUNT']." urls"; + " ... " . $crawl['COUNT']." urls"; } $mixes = $crawl_model->getMixList($user); if (!empty($mixes)) { @@ -121,7 +130,7 @@ class SettingsController extends Controller if ($token_okay) { $changed_settings_flag = $this->loggedInChangeSettings($data); } else if (isset($_REQUEST['its']) && - in_array($_REQUEST['its'],$crawl_stamps)){ + in_array($_REQUEST['its'], $crawl_stamps)){ $data['its'] = $_REQUEST['its']; } else { $data['its'] = $crawl_model->getCurrentIndexDatabaseName(); diff --git a/src/controllers/components/CrawlComponent.php b/src/controllers/components/CrawlComponent.php index b1957e8fa..46496e5da 100644 --- a/src/controllers/components/CrawlComponent.php +++ b/src/controllers/components/CrawlComponent.php @@ -398,6 +398,7 @@ class CrawlComponent extends Component implements CrawlConstants $seed_info['general']['crawl_order']; $crawl_params[self::MAX_DEPTH] = $seed_info['general']['max_depth']; $crawl_params[self::REPEAT_TYPE] = $seed_info['general']['repeat_type']; + $crawl_params[self::ROBOTS_TXT] = $seed_info['general']['robots_txt']; $crawl_params[self::RESTRICT_SITES_BY_URL] = $seed_info['general']['restrict_sites_by_url']; $crawl_params[self::ALLOWED_SITES] = @@ -677,6 +678,19 @@ class CrawlComponent extends Component implements CrawlConstants true : false; $update_flag = true; } + $data['robots_txt_behaviors'] = [ + C\ALWAYS_FOLLOW_ROBOTS => tl('crawl_component_always_follow'), + C\ALLOW_LANDING_ROBOTS => tl('crawl_component_allow_landing'), + C\IGNORE_ROBOTS => tl('crawl_component_ignore'), + ]; + 
$data['robots_txt'] = empty($seed_info['general']['robots_txt']) ? + C\ALWAYS_FOLLOW_ROBOTS : $seed_info['general']['robots_txt']; + if (!$no_further_changes && isset($_REQUEST['robots_txt']) && + in_array($_REQUEST['robots_txt'], + array_keys($data['robots_txt_behaviors']))) { + $seed_info['general']['robots_txt'] = $_REQUEST['robots_txt']; + $update_flag = true; + } $data['restrict_sites_by_url'] = $seed_info['general']['restrict_sites_by_url']; $site_types = ['allowed_sites' => 'url', 'disallowed_sites' => 'url', diff --git a/src/data/public_default.db b/src/data/public_default.db index d4d185a6b..661e6d617 100644 Binary files a/src/data/public_default.db and b/src/data/public_default.db differ diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 645459417..49a9847f0 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -2886,8 +2886,12 @@ class Fetcher implements CrawlConstants } else { $link_origin = $site_url; } + $url_info = []; + if (!empty($site[self::LANG])) { + $url_info[self::LANG] = $site[self::LANG]; + } $meta_ids = PhraseParser::calculateLinkMetas($site_url, - $host, $site[self::DESCRIPTION], $link_origin); + $host, $site[self::DESCRIPTION], $link_origin, $url_info); } else { $is_link = false; $site_url = str_replace('|', "%7C", $site[self::URL]); @@ -2939,8 +2943,10 @@ class Fetcher implements CrawlConstants if (isset($this->programming_language_extension[$lang]) || PhraseParser::computeSafeSearchScore($word_lists, $len, $site_url) < 0.012) { + $meta_ids[] = "safe:all"; $meta_ids[] = "safe:true"; } else { + $meta_ids[] = "safe:all"; $meta_ids[] = "safe:false"; } } @@ -3031,11 +3037,9 @@ class Fetcher implements CrawlConstants } $ref = ($elink_flag) ? "eref" : "iref"; $url = str_replace('|', "%7C", $url); - $link_id = - "url|" . $url . "|text|" . urlencode($link_text) . - "|$ref|" . $site_url; - $elink_flag_string = ($elink_flag) ? 
"e" : - "i"; + $link_id = "url|" . $url . "|text|" . + urlencode($link_text) . "|$ref|" . $site_url; + $elink_flag_string = ($elink_flag) ? "e" : "i"; $link_keys = L\crawlHash($url, true) . L\crawlHash($link_id, true) . $elink_flag_string. @@ -3045,11 +3049,13 @@ class Fetcher implements CrawlConstants // stripping html to be on the safe side $summary[self::DESCRIPTION] = $link_text; $summary[self::TIMESTAMP] = $site[self::TIMESTAMP]; - $summary[self::ENCODING] = $site[self::ENCODING]; + $summary[self::ENCODING] = (empty($site[self::ENCODING])) ? + 'UTF-8' : $site[self::ENCODING]; $summary[self::HASH] = $link_id; $summary[self::TYPE] = "link"; $summary[self::HTTP_CODE] = $link_keys; $summary[self::LANG] = $lang; + $url_info[self::LANG] = $lang; $this->found_sites[self::LINK_SEEN_URLS][$part_num][] = $summary; $link_lists = diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php index 3f9d20bcc..8cb5eb76e 100755 --- a/src/executables/QueueServer.php +++ b/src/executables/QueueServer.php @@ -127,6 +127,13 @@ class QueueServer implements CrawlConstants, Join * @var string */ public $max_depth; + /** + * One of a fixed set of values which are used to control to what extent + * Yioop follows robots.txt files: ALWAYS_FOLLOW_ROBOTS, + * ALLOW_LANDING_ROBOTS, IGNORE_ROBOTS + * @var int + */ + public $robots_txt; /** * Stores the name of the summarizer used for crawling. 
* Possible values are Basic and Centroid @@ -306,6 +313,7 @@ class QueueServer implements CrawlConstants, Join $this->archive_modified_time = 0; $this->crawl_time = 0; $this->channel = 0; + $this->robots_txt = C\ALWAYS_FOLLOW_ROBOTS; $this->cache_pages = true; $this->page_recrawl_frequency = C\PAGE_RECRAWL_FREQUENCY; $this->page_range_request = C\PAGE_RANGE_REQUEST; @@ -889,6 +897,15 @@ class QueueServer implements CrawlConstants, Join $this->crawl_time); if ($this->crawl_type == self::WEB_CRAWL) { L\crawlLog("Performing a web crawl!"); + L\crawlLog("robots.txt behavior is: "); + if ($this->robots_txt == C\ALWAYS_FOLLOW_ROBOTS) { + L\crawlLog(" Always follow robots.txt"); + } else if ($this->robots_txt == + C\ALLOW_LANDING_ROBOTS) { + L\crawlLog(" Allow landing pages."); + } else { + L\crawlLog(" Ignore robots.txt."); + } } else { L\crawlLog("Performing an archive crawl of " . "archive with timestamp " . @@ -1019,6 +1036,8 @@ class QueueServer implements CrawlConstants, Join -1 : $this->repeat_type; $crawl_status['MAX_DEPTH'] = (empty($this->max_depth)) ? -1 : $this->max_depth; + $crawl_status['ROBOTS_TXT'] = (empty($this->robots_txt)) ? 
+ C\ALWAYS_FOLLOW_ROBOTS : $this->robots_txt; $crawl_status['COUNT'] = 0; $crawl_status['DESCRIPTION'] = $message; file_put_contents($this->crawl_status_file_name, @@ -1215,6 +1234,7 @@ class QueueServer implements CrawlConstants, Join "page_range_request" => self::PAGE_RANGE_REQUEST, "max_depth" => self::MAX_DEPTH, "repeat_type" => self::REPEAT_TYPE, + "robots_txt" => self::ROBOTS_TXT, "max_description_len" => self::MAX_DESCRIPTION_LEN, "page_recrawl_frequency" => self::PAGE_RECRAWL_FREQUENCY, "indexed_file_types" => self::INDEXED_FILE_TYPES, @@ -1407,6 +1427,7 @@ class QueueServer implements CrawlConstants, Join } $updatable_info = [ "repeat_type" =>self::REPEAT_TYPE, + "robots_txt" =>self::ROBOTS_TXT, "page_range_request" => self::PAGE_RANGE_REQUEST, "max_description_len" => self::MAX_DESCRIPTION_LEN, "page_recrawl_frequency" => self::PAGE_RECRAWL_FREQUENCY, @@ -1823,7 +1844,9 @@ class QueueServer implements CrawlConstants, Join */ public function processRobotUrls() { - if (!isset($this->web_queue) ) {return;} + if (!isset($this->web_queue) ) { + return; + } L\crawlLog("Scheduler Checking age of robot data in queue server "); if ($this->web_queue->getRobotTxtAge() > C\CACHE_ROBOT_TXT_TIME) { $this->deleteRobotData(); @@ -2229,6 +2252,8 @@ class QueueServer implements CrawlConstants, Join if (!empty($this->index_archive->repeat_time)) { $crawl_status['REPEAT_TIME'] = $this->index_archive->repeat_time; } + $crawl_status['ROBOTS_TXT'] = (empty($this->robots_txt)) ? + C\ALWAYS_FOLLOW_ROBOTS : $this->robots_txt; $crawl_status['MAX_DEPTH'] = (empty($this->max_depth)) ? -1 : $this->max_depth; $index_archive_class = C\NS_LIB . (($crawl_status['REPEAT_TYPE'] > 0 ) ? 
@@ -2290,6 +2315,7 @@ class QueueServer implements CrawlConstants, Join $sites[self::CRAWL_ORDER] = $this->crawl_order; $sites[self::MAX_DEPTH] = $this->max_depth; $sites[self::REPEAT_TYPE] = $this->repeat_type; + $sites[self::ROBOTS_TXT] = $this->robots_txt; $sites[self::CRAWL_TYPE] = $this->crawl_type; $sites[self::CRAWL_INDEX] = $this->crawl_index; $sites[self::CACHE_PAGES] = $this->cache_pages; @@ -2422,7 +2448,11 @@ class QueueServer implements CrawlConstants, Join $robots_okay = true; if ($has_robots) { if ($no_flags) { - if (!isset($hard_coded) || !$hard_coded) { + if ($this->robots_txt == C\IGNORE_ROBOTS || + ($this->robots_txt == C\ALLOW_LANDING_ROBOTS && + rtrim($url, "/") == rtrim($host_url, "/"))) { + $robots_okay = true; + } else if (!isset($hard_coded) || !$hard_coded) { $robots_okay = $this->web_queue->checkRobotOkay($url); } else { $robots_okay = true; diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php index 65136d8ba..ebeb5dc09 100755 --- a/src/library/CrawlConstants.php +++ b/src/library/CrawlConstants.php @@ -110,6 +110,7 @@ interface CrawlConstants const DESCRIPTION_SCORES = 'A'; const HEIGHT = 'B'; const WIDTH = 'C'; + const ROBOTS_TXT = 'D'; // codes available here const DOC_DEPTH = 'M'; const DOC_RANK = 'N'; diff --git a/src/library/FetchUrl.php b/src/library/FetchUrl.php index 7a8253c9d..8f3c578eb 100755 --- a/src/library/FetchUrl.php +++ b/src/library/FetchUrl.php @@ -857,7 +857,7 @@ class FetchUrl implements CrawlConstants if (count($agents) > $MAX_AGENTS) { $agent_host = array_shift($agents); if ($agent_host) { - curl_close($agent_host); + @curl_close($agent_host); } } $host = ($post_data) ? "POST " . $host : "GET " . 
$host; diff --git a/src/library/LocaleFunctions.php b/src/library/LocaleFunctions.php index 28ce714af..d126431df 100755 --- a/src/library/LocaleFunctions.php +++ b/src/library/LocaleFunctions.php @@ -38,6 +38,17 @@ use seekquarry\yioop\models\LocaleModel; /** For Yioop global defines */ require_once __DIR__."/../configs/Config.php"; +/** + * Returns an array of locales that have a stop words list and a stop words + * remover method + * @return array list of locales that have a stopwords list; + */ +function localesWithStopwordsList() +{ + return ['ar', 'bn', 'de', 'en-US', 'es', 'fa', 'fr-FR', 'he', 'hi', + 'in-ID', 'it', 'ja', 'kn', 'ko', 'nl', 'pl', 'pt', 'ru', 'te', 'th', + 'vi-VN', 'zh-CN']; +} /** * Attempts to guess the user's locale based on the request, session, * and user-agent data @@ -96,7 +107,6 @@ function guessLocale() * provided uses current locale's value * @param int threshold number of chars to guess a particular encoding * @return string IANA language tag of the guessed locale - */ function guessLocaleFromString($phrase_string, $locale_tag = null) { @@ -105,9 +115,7 @@ function guessLocaleFromString($phrase_string, $locale_tag = null) } $len = strlen($phrase_string); if ($len >= C\NAME_LEN) { - foreach (['ar', 'bn', 'de', 'en-US', 'es', 'fa', 'fr-FR', 'he', 'hi', - 'in-ID', 'it', 'ja', 'kn', 'ko', 'nl', 'pl', 'pt', 'ru', 'te', 'th', - 'vi-VN', 'zh-CN'] as $lang) { + foreach (localesWithStopwordsList() as $lang) { $tokenizer = PhraseParser::getTokenizer($lang); if ($tokenizer) { $test_len = diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php index f49eb242d..16e279e2d 100755 --- a/src/library/PhraseParser.php +++ b/src/library/PhraseParser.php @@ -1140,7 +1140,7 @@ class PhraseParser } if (isset($site[CrawlConstants::OPERATING_SYSTEM])) { $meta_ids[] = 'os:all'; - $meta_ids[] = 'os:'.strtolower( + $meta_ids[] = 'os:'. 
strtolower( $site[CrawlConstants::OPERATING_SYSTEM]); } if (isset($site[CrawlConstants::MODIFIED])) { @@ -1153,19 +1153,26 @@ class PhraseParser if (isset($site[CrawlConstants::TIMESTAMP])) { $date = $site[CrawlConstants::TIMESTAMP]; $meta_ids[] = 'date:all'; - $meta_ids[] = 'date:'.date('Y', $date); - $meta_ids[] = 'date:'.date('Y-m', $date); - $meta_ids[] = 'date:'.date('Y-m-d', $date); - $meta_ids[] = 'date:'.date('Y-m-d-H', $date); - $meta_ids[] = 'date:'.date('Y-m-d-H-i', $date); - $meta_ids[] = 'date:'.date('Y-m-d-H-i-s', $date); + $meta_ids[] = 'date:' . date('Y', $date); + $meta_ids[] = 'date:' . date('Y-m', $date); + $meta_ids[] = 'date:' . date('Y-m-d', $date); + $meta_ids[] = 'date:' . date('Y-m-d-H', $date); + $meta_ids[] = 'date:' . date('Y-m-d-H-i', $date); + $meta_ids[] = 'date:' . date('Y-m-d-H-i-s', $date); } if (isset($site[CrawlConstants::LANG])) { $meta_ids[] = 'lang:all'; - $lang_parts = explode("-", $site[CrawlConstants::LANG]); + $lang = strtolower($site[CrawlConstants::LANG]); + $lang_parts = explode("-", $lang); $meta_ids[] = 'lang:' . $lang_parts[0]; - if (isset($lang_parts[1])){ - $meta_ids[] = 'lang:' . strtolower($site[CrawlConstants::LANG]); + if (isset($lang_parts[1])) { + $meta_ids[] = 'lang:' . $lang; + } + if ($lang == 'mul') { + foreach (localesWithStopwordsList() as $lang) { + $lang_parts = explode("-", $lang); + $meta_ids[] = 'lang:' . $lang_parts[0]; + } } } if (isset($site[CrawlConstants::AGENT_LIST])) { @@ -1219,14 +1226,29 @@ class PhraseParser if (!empty($url_info['pubdate']) && $date = strtotime($url_info['pubdate'])) { $link_meta_ids[] = 'date:all'; - $link_meta_ids[] = 'date:'.date('Y', $date); - $link_meta_ids[] = 'date:'.date('Y-m', $date); - $link_meta_ids[] = 'date:'.date('Y-m-d', $date); - $link_meta_ids[] = 'date:'.date('Y-m-d-H', $date); - $link_meta_ids[] = 'date:'.date('Y-m-d-H-i', $date); - $link_meta_ids[] = 'date:'.date('Y-m-d-H-i-s', $date); + $link_meta_ids[] = 'date:' . 
date('Y', $date); + $link_meta_ids[] = 'date:' . date('Y-m', $date); + $link_meta_ids[] = 'date:' . date('Y-m-d', $date); + $link_meta_ids[] = 'date:' . date('Y-m-d-H', $date); + $link_meta_ids[] = 'date:' . date('Y-m-d-H-i', $date); + $link_meta_ids[] = 'date:' . date('Y-m-d-H-i-s', $date); } $link_meta_ids[] = "link:all"; + if (!empty($url_info[CrawlConstants::LANG])) { + $link_meta_ids[] = 'lang:all'; + $lang = strtolower($url_info[CrawlConstants::LANG]); + $lang_parts = explode("-", $lang); + $link_meta_ids[] = 'lang:' . $lang_parts[0]; + if (isset($lang_parts[1])) { + $link_meta_ids[] = 'lang:' . $lang; + } + if ($lang == 'mul') { + foreach (localesWithStopwordsList() as $lang) { + $lang_parts = explode("-", $lang); + $link_meta_ids[] = 'lang:' . $lang_parts[0]; + } + } + } return $link_meta_ids; } /** diff --git a/src/library/ScraperManager.php b/src/library/ScraperManager.php index 490005e32..57f7abf70 100644 --- a/src/library/ScraperManager.php +++ b/src/library/ScraperManager.php @@ -122,7 +122,7 @@ class ScraperManager $out_text = $value_or_query; } else { if ($xpath = new \DOMXpath($dom)) { - $results = $xpath->query($value_or_query); + $results = @$xpath->query($value_or_query); if (!empty($results) && $results->length > 0) { $len = $results->length; for ($i = 0; $i < $len; $i++) { diff --git a/src/library/processors/CompressedProcessor.php b/src/library/processors/CompressedProcessor.php index 2fc456406..fbb6e3b60 100644 --- a/src/library/processors/CompressedProcessor.php +++ b/src/library/processors/CompressedProcessor.php @@ -84,8 +84,8 @@ class CompressedProcessor extends PageProcessor */ public function process($page, $url) { - if (preg_match('/^(.+)\.(gz|bz|zip)$/', $url, $match) === false && - !empty($match[1])) { + if (preg_match('/^(.+)\.(gz|bz|zip)$/', $url, $match) === false || + empty($match[2])) { return null; } $uncompress_url = $match[1]; diff --git a/src/locale/ar/configure.ini b/src/locale/ar/configure.ini index 29ca71c02..4dc55181e 
100755 --- a/src/locale/ar/configure.ini +++ b/src/locale/ar/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "محددات مواقع المعلومات حقن!" crawl_component_update_seed_info = "تحديث معلومات الموقع البذور!" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "إعدادات" settings_view_results_per_page = "النتائج في الصفحة الواحدة:" settings_view_open_in_tabs = "فتح النتائج في تبويب" +settings_view_safe_search = "" settings_view_search_index = "البحث في الفهرس:" settings_view_language_label = "اللغة:" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "الزحف الترتيب:" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "تقييد مواقع بعنوان:" crawloptions_element_allowed_to_crawl = "يسمح بتتبع ارتباطات مواقع" crawloptions_element_disallowed_and_quota_sites = "غير مسموح به مواقع/مواقع مع الحصص" diff --git a/src/locale/bn/configure.ini b/src/locale/bn/configure.ini index d1316a74b..187a9b46f 100755 --- a/src/locale/bn/configure.ini +++ b/src/locale/bn/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "" crawl_component_update_seed_info = "" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "" settings_view_results_per_page = "" settings_view_open_in_tabs = "" +settings_view_safe_search = "" 
settings_view_search_index = "" settings_view_language_label = "" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "" crawloptions_element_allowed_to_crawl = "" crawloptions_element_disallowed_and_quota_sites = "" diff --git a/src/locale/de/configure.ini b/src/locale/de/configure.ini index a5e1c36c3..543899984 100755 --- a/src/locale/de/configure.ini +++ b/src/locale/de/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "" crawl_component_update_seed_info = "" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "" settings_view_results_per_page = "" settings_view_open_in_tabs = "" +settings_view_safe_search = "" settings_view_search_index = "" settings_view_language_label = "" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "" crawloptions_element_allowed_to_crawl = "" crawloptions_element_disallowed_and_quota_sites = "" diff --git a/src/locale/en_US/configure.ini b/src/locale/en_US/configure.ini index 895b970db..df9a37e93 100644 --- a/src/locale/en_US/configure.ini +++ b/src/locale/en_US/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "Monthly" crawl_component_bimonthly = "Bimonthly" crawl_component_semiannually = "Semi-Annually" crawl_component_annually = "Annually" +crawl_component_always_follow = 
"Always Follow" +crawl_component_allow_landing = "Allow Landing Page Crawl" +crawl_component_ignore = "Ignore" crawl_component_urls_injected = "Urls Injected!" crawl_component_update_seed_info = "Updating Seed Site Info!" crawl_component_description = "Index Description:" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "Auto-logout in One Minute!!" settings_view_settings = "Settings" settings_view_results_per_page = "Results/Page:" settings_view_open_in_tabs = "Open in Tabs:" +settings_view_safe_search = "Safe Search:" settings_view_search_index = "Search Index:" settings_view_language_label = "Language:" settings_view_return = "Return" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "Server Channel:" crawloptions_element_crawl_order = "Crawl Order:" crawloptions_element_max_depth = "Max Depth:" crawloptions_element_repeat_type = "Repeat Type:" +crawloptions_element_robots_txt = "Robots.txt:" crawloptions_element_restrict_by_url = "Restrict Sites By Url:" crawloptions_element_allowed_to_crawl = "Allowed To Crawl Sites" crawloptions_element_disallowed_and_quota_sites = "Disallowed Sites/Sites with Quotas" diff --git a/src/locale/es/configure.ini b/src/locale/es/configure.ini index d72298972..0cce485e3 100755 --- a/src/locale/es/configure.ini +++ b/src/locale/es/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "Urls inyectadas!" 
crawl_component_update_seed_info = "" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "" settings_view_results_per_page = "" settings_view_open_in_tabs = "" +settings_view_safe_search = "" settings_view_search_index = "" settings_view_language_label = "" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "Orden de Rastreo:" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "Restringir los sitios por URL:" crawloptions_element_allowed_to_crawl = "Permitido para rastrear sitios" crawloptions_element_disallowed_and_quota_sites = "Sitios no permitidos/Sites with Quotas" diff --git a/src/locale/fa/configure.ini b/src/locale/fa/configure.ini index 3cf6077a2..7c132fa2d 100755 --- a/src/locale/fa/configure.ini +++ b/src/locale/fa/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "urlها وارد شدند." 
crawl_component_update_seed_info = "در حال به روز آوری اطلاعات seed site " crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "تنظیمات" settings_view_results_per_page = "نتایج در هر صفحه:" settings_view_open_in_tabs = "نتایج را در صفحه‌های جدید باز کن:" +settings_view_safe_search = "" settings_view_search_index = "نمایهٔ جستجو:" settings_view_language_label = "زبان:" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "مرتبهٔ خزش:" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "سایت‌ها را بر اساس URL محدود کن:" crawloptions_element_allowed_to_crawl = "اجازهٔ خزیدن در این سایت‌ها هست" crawloptions_element_disallowed_and_quota_sites = "سایت‌های غیرمجاز/سایت‌های با سهمیه بندی" diff --git a/src/locale/fr_FR/configure.ini b/src/locale/fr_FR/configure.ini index adaaffe36..60ffa252d 100755 --- a/src/locale/fr_FR/configure.ini +++ b/src/locale/fr_FR/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "" crawl_component_update_seed_info = "" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "Préférences" settings_view_results_per_page = "Résultats par page" settings_view_open_in_tabs = "Ouvrez les résultats dans un nouvel onglet" +settings_view_safe_search = "" settings_view_search_index = "L'index de recherche:" settings_view_language_label = "Langage:" settings_view_return = "Retour" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "" crawloptions_element_max_depth = "" 
crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "" crawloptions_element_allowed_to_crawl = "" crawloptions_element_disallowed_and_quota_sites = "" diff --git a/src/locale/he/configure.ini b/src/locale/he/configure.ini index 0f71065c5..94bb82461 100755 --- a/src/locale/he/configure.ini +++ b/src/locale/he/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "" crawl_component_update_seed_info = "" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "הגדרות" settings_view_results_per_page = "" settings_view_open_in_tabs = "" +settings_view_safe_search = "" settings_view_search_index = "" settings_view_language_label = "" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "" crawloptions_element_allowed_to_crawl = "" crawloptions_element_disallowed_and_quota_sites = "" diff --git a/src/locale/hi/configure.ini b/src/locale/hi/configure.ini index c3b7f9edc..27d6d4e36 100755 --- a/src/locale/hi/configure.ini +++ b/src/locale/hi/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "" crawl_component_update_seed_info = "" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "सेटिंग्स" 
settings_view_results_per_page = "परिणाम प्रति पृष्ठ " settings_view_open_in_tabs = "" +settings_view_safe_search = "" settings_view_search_index = "खोज अनुक्रमणिका" settings_view_language_label = "भाषा" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "" crawloptions_element_allowed_to_crawl = "" crawloptions_element_disallowed_and_quota_sites = "" diff --git a/src/locale/in_ID/configure.ini b/src/locale/in_ID/configure.ini index 3aa8630a7..cacfe1fa5 100755 --- a/src/locale/in_ID/configure.ini +++ b/src/locale/in_ID/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "" crawl_component_update_seed_info = "" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "" settings_view_results_per_page = "" settings_view_open_in_tabs = "" +settings_view_safe_search = "" settings_view_search_index = "" settings_view_language_label = "" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "" crawloptions_element_allowed_to_crawl = "" crawloptions_element_disallowed_and_quota_sites = "" diff --git a/src/locale/it/configure.ini b/src/locale/it/configure.ini index 193e80e11..85f6eba02 100755 --- a/src/locale/it/configure.ini +++ b/src/locale/it/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" 
crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "URL aggiunti!" crawl_component_update_seed_info = "Aggiornamento info sito di partenza!" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "Impostazioni" settings_view_results_per_page = "Risultati per Pagina:" settings_view_open_in_tabs = "" +settings_view_safe_search = "" settings_view_search_index = "Cerca nell'Indice:" settings_view_language_label = "Lingua:" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "Ordine Scansione:" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "Limita siti con URL:" crawloptions_element_allowed_to_crawl = "Siti ammessi alla Scansione" crawloptions_element_disallowed_and_quota_sites = "Siti non ammessi/Siti con limiti" diff --git a/src/locale/ja/configure.ini b/src/locale/ja/configure.ini index 94bb069e3..1f9be1b09 100755 --- a/src/locale/ja/configure.ini +++ b/src/locale/ja/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "" crawl_component_update_seed_info = "シッド情報の更新" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "設定" settings_view_results_per_page = "ページごとの結果:" settings_view_open_in_tabs = "" +settings_view_safe_search = "" settings_view_search_index = "検索指数" settings_view_language_label = "言語:" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" 
crawloptions_element_crawl_order = "検索の順序" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "URLで制限" crawloptions_element_allowed_to_crawl = "検索ができます" crawloptions_element_disallowed_and_quota_sites = "" diff --git a/src/locale/kn/configure.ini b/src/locale/kn/configure.ini index 540e943fb..1d9ff6ef3 100755 --- a/src/locale/kn/configure.ini +++ b/src/locale/kn/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "" crawl_component_update_seed_info = "ಮೂಲ ವೆಬ್ ಸೈಟಿನ ಮಾಹಿತಿಯನ್ನು ಪರಿಷ್ಕರಿಸಲಾಗುತ್ತಿದೆ" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "ನಿಗದಿಗಳು" settings_view_results_per_page = "ಪ್ರತಿ ಪುಟದಲ್ಲಿ ಫಲಿತಾಂಶಗಳ ಸಂಖ್ಯೆ:" settings_view_open_in_tabs = "" +settings_view_safe_search = "" settings_view_search_index = "ಶೋಧನಾ ಸೂಚಿಕೆ" settings_view_language_label = "ಭಾಷೆ:" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "" crawloptions_element_allowed_to_crawl = "" crawloptions_element_disallowed_and_quota_sites = "" diff --git a/src/locale/ko/configure.ini b/src/locale/ko/configure.ini index e505c6ac7..b115acbcd 100755 --- a/src/locale/ko/configure.ini +++ b/src/locale/ko/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" 
crawl_component_urls_injected = "" crawl_component_update_seed_info = "씨드 사이트 업데이트" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "세팅" settings_view_results_per_page = "한 페이지 당 결과물 " settings_view_open_in_tabs = "" +settings_view_safe_search = "" settings_view_search_index = "인덱스 찾기:" settings_view_language_label = "언어:" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "크롤 순서:" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "사이트들을 주소로 제한:" crawloptions_element_allowed_to_crawl = "크롤을 허가한 사이트들" crawloptions_element_disallowed_and_quota_sites = "" diff --git a/src/locale/nl/configure.ini b/src/locale/nl/configure.ini index 52b2650d3..34296e6de 100644 --- a/src/locale/nl/configure.ini +++ b/src/locale/nl/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "Urls Injected!" crawl_component_update_seed_info = "Updating Seed Site Info!" 
crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "instellingen" settings_view_results_per_page = "Resultaten / Pagina:" settings_view_open_in_tabs = "Openen in tabbladen:" +settings_view_safe_search = "" settings_view_search_index = "Zoek Index:" settings_view_language_label = "taal:" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "Crawl Order:" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "Beperken Sites Door Url:" crawloptions_element_allowed_to_crawl = "Toegestaan te kruipen sites" crawloptions_element_disallowed_and_quota_sites = "Afgekeurd sites / locaties met quota" diff --git a/src/locale/pl/configure.ini b/src/locale/pl/configure.ini index ab35706c7..82ca15d08 100755 --- a/src/locale/pl/configure.ini +++ b/src/locale/pl/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "" crawl_component_update_seed_info = "" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "" settings_view_results_per_page = "" settings_view_open_in_tabs = "" +settings_view_safe_search = "" settings_view_search_index = "" settings_view_language_label = "" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "" crawloptions_element_allowed_to_crawl = "" crawloptions_element_disallowed_and_quota_sites = "" diff --git 
a/src/locale/pt/configure.ini b/src/locale/pt/configure.ini index 986c5bfb6..477aee25b 100755 --- a/src/locale/pt/configure.ini +++ b/src/locale/pt/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "" crawl_component_update_seed_info = "" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "" settings_view_results_per_page = "" settings_view_open_in_tabs = "" +settings_view_safe_search = "" settings_view_search_index = "" settings_view_language_label = "" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "" crawloptions_element_allowed_to_crawl = "" crawloptions_element_disallowed_and_quota_sites = "" diff --git a/src/locale/ru/configure.ini b/src/locale/ru/configure.ini index 2c0ac78d6..6d3e61bcd 100755 --- a/src/locale/ru/configure.ini +++ b/src/locale/ru/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "" crawl_component_update_seed_info = "" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "" settings_view_results_per_page = "" settings_view_open_in_tabs = "" +settings_view_safe_search = "" settings_view_search_index = "" settings_view_language_label = "" settings_view_return = "" @@ -1417,6 +1421,7 @@ 
crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "" crawloptions_element_allowed_to_crawl = "" crawloptions_element_disallowed_and_quota_sites = "" diff --git a/src/locale/te/configure.ini b/src/locale/te/configure.ini index 6888ac855..8785fb928 100644 --- a/src/locale/te/configure.ini +++ b/src/locale/te/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "యుఆరెల్స్ ఇంజెక్ట్ చేయబడినవి!" crawl_component_update_seed_info = "అప్డేటింగ్ సీడ్ సైట్ ఇన్ఫో!" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "సెట్టింగులు" settings_view_results_per_page = "ఫలితాలు/పేజ్:" settings_view_open_in_tabs = "టాబ్స్ లో తెరువు:" +settings_view_safe_search = "" settings_view_search_index = "శోధన సూచిక:" settings_view_language_label = "భాష:" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "" crawloptions_element_allowed_to_crawl = "" crawloptions_element_disallowed_and_quota_sites = "" diff --git a/src/locale/te/statistics.txt b/src/locale/te/statistics.txt index e8680e29f..d655566cc 100755 --- a/src/locale/te/statistics.txt +++ b/src/locale/te/statistics.txt @@ -1 +1 @@ -d:41; \ No newline at end of file +d:40; \ No newline at end of file diff --git a/src/locale/th/configure.ini b/src/locale/th/configure.ini index 0f5e3dde1..6193dc76f 100755 --- a/src/locale/th/configure.ini +++ 
b/src/locale/th/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "" crawl_component_update_seed_info = "" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "" settings_view_results_per_page = "" settings_view_open_in_tabs = "" +settings_view_safe_search = "" settings_view_search_index = "" settings_view_language_label = "" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "" crawloptions_element_allowed_to_crawl = "" crawloptions_element_disallowed_and_quota_sites = "" diff --git a/src/locale/tr/configure.ini b/src/locale/tr/configure.ini index 30ca2e314..1eca813bc 100755 --- a/src/locale/tr/configure.ini +++ b/src/locale/tr/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "" crawl_component_update_seed_info = "" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "" settings_view_results_per_page = "" settings_view_open_in_tabs = "" +settings_view_safe_search = "" settings_view_search_index = "" settings_view_language_label = "" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" 
+crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "" crawloptions_element_allowed_to_crawl = "" crawloptions_element_disallowed_and_quota_sites = "" diff --git a/src/locale/vi_VN/configure.ini b/src/locale/vi_VN/configure.ini index 313406dca..c4d64f7e4 100755 --- a/src/locale/vi_VN/configure.ini +++ b/src/locale/vi_VN/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "" crawl_component_update_seed_info = "Cập nhật thông tin trang mạng lươi hạt giống" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "Sự sắp đặt" settings_view_results_per_page = "Kết quả trong mỗi trang" settings_view_open_in_tabs = "" +settings_view_safe_search = "" settings_view_search_index = "Tìm mục lục:" settings_view_language_label = "Ngôn ngữ:" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "" crawloptions_element_allowed_to_crawl = "" crawloptions_element_disallowed_and_quota_sites = "" diff --git a/src/locale/vi_VN/statistics.txt b/src/locale/vi_VN/statistics.txt index 25dc1f752..fe8cd74a8 100755 --- a/src/locale/vi_VN/statistics.txt +++ b/src/locale/vi_VN/statistics.txt @@ -1 +1 @@ -d:8; \ No newline at end of file +d:7; \ No newline at end of file diff --git a/src/locale/zh_CN/configure.ini b/src/locale/zh_CN/configure.ini index 7324865eb..daf594212 100755 --- a/src/locale/zh_CN/configure.ini +++ b/src/locale/zh_CN/configure.ini @@ -382,6 +382,9 @@ crawl_component_monthly = "" crawl_component_bimonthly = "" 
crawl_component_semiannually = "" crawl_component_annually = "" +crawl_component_always_follow = "" +crawl_component_allow_landing = "" +crawl_component_ignore = "" crawl_component_urls_injected = "插入網址" crawl_component_update_seed_info = "更新種子資訊" crawl_component_description = "" @@ -759,6 +762,7 @@ admin_view_auto_logout_one_minute = "" settings_view_settings = "設定" settings_view_results_per_page = "每頁顯示項目數量" settings_view_open_in_tabs = "" +settings_view_safe_search = "" settings_view_search_index = "" settings_view_language_label = "語言" settings_view_return = "" @@ -1417,6 +1421,7 @@ crawloptions_element_server_channel = "" crawloptions_element_crawl_order = "" crawloptions_element_max_depth = "" crawloptions_element_repeat_type = "" +crawloptions_element_robots_txt = "" crawloptions_element_restrict_by_url = "" crawloptions_element_allowed_to_crawl = "" crawloptions_element_disallowed_and_quota_sites = "" diff --git a/src/models/CrawlModel.php b/src/models/CrawlModel.php index 9de7fa7c4..aacfa123c 100755 --- a/src/models/CrawlModel.php +++ b/src/models/CrawlModel.php @@ -498,6 +498,9 @@ EOT; if (!isset($info['general']['repeat_type'])) { $info['general']['repeat_type'] = -1; } + if (!isset($info['general']['robots_txt'])) { + $info['general']['robots_txt'] = C\ALWAYS_FOLLOW_ROBOTS; + } if (!isset($info['general']['page_recrawl_frequency'])) { $info['general']['page_recrawl_frequency'] = C\PAGE_RECRAWL_FREQUENCY; @@ -512,6 +515,7 @@ EOT; $n[] = "crawl_type = '" . $info['general']['crawl_type'] . "';"; $n[] = "max_depth = '" . $info['general']['max_depth'] . "';"; $n[] = "repeat_type = '" . $info['general']['repeat_type'] . "';"; + $n[] = "robots_txt = '" . $info['general']['robots_txt'] . "';"; $n[] = "crawl_index = '" . $info['general']['crawl_index'] . "';"; $n[] = "channel = '" . $info['general']['channel'] . "';"; $n[] = 'arc_dir = "' . $info["general"]["arc_dir"] . 
'";'; @@ -637,6 +641,7 @@ EOT; "crawl_order" => [self::CRAWL_ORDER, self::PAGE_IMPORTANCE], "max_depth" => [self::MAX_DEPTH, -1], "repeat_type" => [self::REPEAT_TYPE, -1], + "robots_txt" => [self::ROBOTS_TXT, C\ALWAYS_FOLLOW_ROBOTS], "summarizer_option" => [self::SUMMARIZER_OPTION, self::BASIC_SUMMARIZER], "arc_dir" => [self::ARC_DIR, ''], @@ -709,7 +714,7 @@ EOT; $index_info[self::RESTRICT_SITES_BY_URL] = $new_info['general']["restrict_sites_by_url"]; } - if (isset($new_info['general']["restrict_sites_by_url"])) { + if (isset($new_info['general']["repeat_type"])) { $index_info[self::REPEAT_TYPE] = $new_info['general']["repeat_type"]; } diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index 387143584..4c8da6e0c 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -731,6 +731,7 @@ class PhraseModel extends ParallelModel $found_lang = false; $found_safe = false; $found_media_image = false; + $found_site = false; foreach ($meta_words as $meta_word) { $pattern = "/(\s)($meta_word(\S)+)/"; preg_match_all($pattern, $phrase, $matches); @@ -757,6 +758,9 @@ class PhraseModel extends ParallelModel && substr($matches[0], 0, 11) == 'media:image') { $found_media_image = true; } + if ($meta_word == 'site:' && !empty($matches[0])) { + $found_site = true; + } if ($meta_word == 'safe:' && !empty($matches[0])) { $found_safe = true; } @@ -772,15 +776,16 @@ class PhraseModel extends ParallelModel $phrase_string = ""; } $found_metas = array_unique($found_metas); - if (!$found_media_image && + if (!$found_media_image && !$found_site && (!empty($found_metas) || !empty($phrase_string))) { if (!$found_lang) { $lang_parts = explode("-", $locale_tag); $found_metas[] = "lang:" . $lang_parts[0]; } if (!$found_safe) { - $found_metas[] = (empty($_SESSION['safe'])) ? "safe:true" : - "safe:" . $_SESSION['safe']; + $found_metas[] = (!isset($_SESSION['SAFE_SEARCH']) || + $_SESSION['SAFE_SEARCH']) ? 
"safe:true" : + "safe:all"; } } $disallow_phrases = array_unique($disallow_phrases); @@ -1554,15 +1559,17 @@ class PhraseModel extends ParallelModel $out_query = ""; $pipe = ""; foreach ($disjunct_phrases as $disjunct) { - if (!stristr($disjunct, "lang:") && - !stristr($disjunct, "media:image")) { - $locale_tag = L\guessLocaleFromString($original_query); - $lang_parts = explode("-", $locale_tag); - $disjunct .= " lang:" . $lang_parts[0]; - } - if (!stristr($disjunct, "safe:")) { - $disjunct .= (empty($_SESSION['safe'])) ? - " safe:true" : " safe:" . $_SESSION['safe']; + if (!stristr($disjunct, "site:")) { + if (!stristr($disjunct, "lang:") && + !stristr($disjunct, "media:image")) { + $locale_tag = L\guessLocaleFromString($original_query); + $lang_parts = explode("-", $locale_tag); + $disjunct .= " lang:" . $lang_parts[0]; + } + if (!stristr($disjunct, "safe:")) { + $disjunct .= (empty($_SESSION['safe'])) ? + " safe:true" : " safe:" . $_SESSION['safe']; + } } $out_query .= $pipe . $disjunct; $pipe = "|"; diff --git a/src/views/SettingsView.php b/src/views/SettingsView.php index 1f2accf24..ab8bcd636 100755 --- a/src/views/SettingsView.php +++ b/src/views/SettingsView.php @@ -80,7 +80,18 @@ class SettingsView extends View tl('settings_view_open_in_tabs') ?></b></label></td><td class="table-input"><input type="checkbox" id="open-in-tabs" name="open_in_tabs" value="true" - <?php if ($data['OPEN_IN_TABS']) {?>checked='checked'<?php } ?> /> + <?php if (!empty($data['OPEN_IN_TABS'])) { + ?>checked='checked'<?php + } ?> /> +</td></tr> +<tr> +<td class="table-label"><label for="safe-search"><b><?= + tl('settings_view_safe_search') ?></b></label></td><td + class="table-input"><input type="checkbox" id="safe-search" + name="safe_search" value="true" + <?php if (!empty($data['SAFE_SEARCH'])) { + ?>checked='checked'<?php + } ?> /> </td></tr> <tr> <td class="table-label"><label for="index-ts"><b><?= @@ -97,7 +108,7 @@ if (count($data['LANGUAGES']) > 1) { ?> <tr><td 
class="cancel"><input type="hidden" name="<?=C\CSRF_TOKEN ?>" value="<?= $data[C\CSRF_TOKEN] ?>" /><?php if (isset($data['return'])){ ?><input type="hidden" name="return" value="<?= $data['return'] ?>" /> - <?php } ?><?php if (isset($data['oldc'])){ ?><input + <?php } ?><?php if (isset($data['oldc'])) { ?><input type="hidden" name="oldc" value="<?= $data['oldc'] ?>" /> <?php } ?><input type="hidden" name="its" value="<?=$data['its'] ?>" /><button class="top-margin" name="c" value="search" <?php diff --git a/src/views/elements/CrawloptionsElement.php b/src/views/elements/CrawloptionsElement.php index 27a92b2de..ab233213e 100644 --- a/src/views/elements/CrawloptionsElement.php +++ b/src/views/elements/CrawloptionsElement.php @@ -138,6 +138,14 @@ class CrawloptionsElement extends Element "Repeat Type", $data[C\CSRF_TOKEN])); ?> </div> + <div class="top-margin"><label for="robots-txt"><b><?= + tl('crawloptions_element_robots_txt') ?></b></label><?php + $this->view->helper("options")->render("robots-txt", "robots_txt", + $data['robots_txt_behaviors'], $data['robots_txt']); + e(" ".$this->view->helper("helpbutton")->render( + "Robots Behaviors", $data[C\CSRF_TOKEN])); + ?> + </div> <div class="top-margin"><label for="restrict-sites-by-url"><b><?= tl('crawloptions_element_restrict_by_url')?></b></label> <input type="checkbox" id="restrict-sites-by-url" diff --git a/src/views/elements/PageoptionsElement.php b/src/views/elements/PageoptionsElement.php index 39d4f6603..606192d82 100644 --- a/src/views/elements/PageoptionsElement.php +++ b/src/views/elements/PageoptionsElement.php @@ -540,7 +540,8 @@ class PageOptionsElement extends Element <?php if ($data['test_options_active'] != "") { ?> <h2><?=tl('pageoptions_element_test_results')?></h2> <?php - if (strlen($_REQUEST['TESTPAGE']) > $data["PAGE_RANGE_REQUEST"]) { + if (!empty($_REQUEST['TESTPAGE']) && + strlen($_REQUEST['TESTPAGE']) > $data["PAGE_RANGE_REQUEST"]) { e("<h3 
class='red'>".tl('pageoptions_element_page_truncated', strlen($_REQUEST['TESTPAGE']), $data["PAGE_RANGE_REQUEST"]). "</h3>");