Last commit for lib/url_parser.php: 9ff742e4cc2ef0dba312dd0c5f642890b6945730

First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris

Chris Pollett [2015-07-01 02:Jul:st]

First pass at converting files to use autoloading! Take care if you have an old yioop system you are upgrading, a=chris

<?php
/**
 *  SeekQuarry/Yioop --
 *  Open Source Pure PHP Search Engine, Crawler, and Indexer
 *
 *  Copyright (C) 2009 - 2013  Chris Pollett chris@pollett.org
 *
 *  LICENSE:
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 *  END LICENSE
 *
 * @author Chris Pollett chris@pollett.org
 * @package seek_quarry
 * @subpackage library
 * @license http://www.gnu.org/licenses/ GPL3
 * @link http://www.seekquarry.com/
 * @copyright 2009 - 2013
 * @filesource
 */

if(!defined('BASE_DIR')) {echo "BAD REQUEST"; exit();}

/**
 * Library of functions used to manipulate and to extract components from urls
 *
 *
 * @author Chris Pollett
 *
 * @package seek_quarry
 * @subpackage library
 */

class UrlParser
{

    /**
     * Checks if the url scheme is either http or https.
     *
     * @param string $url  the url to check
     * @return bool returns true if it is either http or https and false
     *      otherwise
     */
    static function isSchemeHttpOrHttps($url)
    {
        $url_parts = @parse_url($url);

        if(isset($url_parts['scheme']) && $url_parts['scheme'] != "http" &&
            $url_parts['scheme'] != "https") {
            return false;
        }
        return true;
    }

    /**
     * Converts a url with a scheme into one without. Also removes trailing
     * slashes from url. Shortens url to desired length by inserting ellipsis
     * for part of it if necessary
     *
     * @param string $url  the url to trim
     * @param int $max_len length to shorten url to, 0 = no shortening
     * @return string the trimmed url
     */
    static function simplifyUrl($url, $max_len = 0)
    {
        $url_parts = @parse_url($url);
        if(isset($url_parts['scheme'])) {
            $scheme_len = strlen($url_parts['scheme']);
            $url = mb_substr($url, $scheme_len + 3);
        }
        $len = strlen($url);
        if(isset($url[$len - 1]) && $url[$len - 1] == "/") {
            $url = mb_substr($url, 0, $len - 1);
        }
        if($len > 0 && $len > $max_len) {
            $front_len = ceil($max_len * 0.8);
            $end_len = ceil($max_len * 0.2);
            $url = mb_substr($url, 0, $front_len)."...".
                mb_substr($url, -$end_len);
        }
        return $url;
    }

    /**
     * Checks if the url has a host part.
     *
     * @param string $url  the url to check
     * @return bool true if it does; false otherwise
     */
    static function hasHostUrl($url)
    {
       $url_parts = @parse_url($url);

       return isset($url_parts['host']);
    }

    /**
     * Get the host name portion of a url if present; if not return false
     *
     * @param string $url the url to parse
     * @param bool $with_login whether to include user,password,port if present
     * @return the host portion of the url if present; false otherwise
     */
    static function getHost($url, $with_login_and_port = true)
    {
        $url_parts = @parse_url($url);

        if(!isset($url_parts['scheme']) ) {return false;}
        $host_url = $url_parts['scheme'].'://';

        //handles common typo http:/yahoo.com rather than http://yahoo.com
        if(!isset($url_parts['host'])) {
            if(isset($url_parts['path'])) {
                $url_parts = @parse_url($url_parts['scheme'].":/".
                    $url_parts['path']);
                if(!isset($url_parts['host'])) {
                    return false;
                }
            } else {
                return false;
            }
        }

        if($with_login_and_port &&
            isset($url_parts['user']) && isset($url_parts['pass'])) {
            $host_url .= $url_parts['user'].":".$url_parts['pass']."@";
        }

        if(strlen($url_parts['host']) <= 0) { return false; }

        $host_url .= $url_parts['host'];

        if($with_login_and_port && isset($url_parts['port'])) {
            $host_url .= ":".$url_parts['port'];
        }

        return $host_url;
    }

    /**
     *  Attempts to guess the language tag based on url
     *
     *  @param string $url the url to parse
     *  @return the top level domain if present; false otherwise
     */
    static function getLang($url)
    {
        $LANG = array(
            "com" => 'en',
            "edu" => 'en',
            "gov" => 'en',
            "mil" => 'en',
            "org" => 'en',
            "net" => 'en',
            'us' => 'en',
            "uk" => 'en',
            "ca" => 'en',
            "au" => 'en',
            "bz" => 'en',
            "ie" => 'en',
            "jm" => 'en',
            "nz" => 'en',
            "za" => 'en',
            "zw" => 'en',
            "tt" => 'en',
            "eg" => 'ar',
            "dz" => 'ar',
            "bh" => 'ar',
            "jo" => 'ar',
            "kw" => 'ar',
            "lb" => 'ar',
            "iq" => 'ar',
            "ma" => 'ar',
            "om" => 'ar',
            "qa" => 'ar',
            "sa" => 'ar',
            "sy" => 'ar',
            "tn" => 'ar',
            "ae" => 'ar',
            "ye" => 'ar',
            "de" => 'de',
            "at" => "de",
            "es" => 'es',
            "ar" => 'es',
            "bo" => 'es',
            "cl" => 'es',
            "co" => 'es',
            "cr" => 'es',
            "dr" => 'es',
            "ec" => 'es',
            "sv" => 'es',
            "gt" => 'es',
            "hn" => 'es',
            "mx" => 'es',
            "ni" => 'es',
            "pa" => 'es',
            "py" => 'es',
            "pe" => 'es',
            "pr" => 'es',
            "uy" => 'es',
            "ve" => 'es',
            "fr" => 'fr-FR',
            "be" => 'fr-FR',
            "lu" => 'fr-FR',
            "hk" => 'zh-CN',
            "id" => 'in-ID',
            "il" => 'he',
            "ir" => 'fa',
            "it" => 'it',
            "jp" => 'ja',
            "kp" => 'ko',
            "kr" => 'ko',
            "pl" => 'pl',
            "br" => 'pt',
            "pt" => 'pt',
            "qc" => 'fr',
            "ru" => 'ru',
            "sg" => 'zh-CN',
            "th" => 'th',
            "tr" => 'tr',
            "tw" => 'zh-CN',
            "vi" => 'vi-VN',
            "cn" => 'zh-CN',
        );

        $host = self::getHost($url, false);
        if(!$host) return false;

        $host_parts = explode(".", $host);
        $count = count($host_parts);
        if($count > 0) {
            $tld = $host_parts[$count - 1];
            if($tld == 'ca' && isset($host_parts[$count - 2]) &&
                $host_parts[$count - 2] == 'qc') {
                $tld = 'qc';
            }
            if(isset($LANG[$tld])) {
                return $LANG[$tld];
            }
        }

        return NULL;
    }



    /**
     *  Get the path portion of a url if present; if not return NULL
     *
     *  @param string $url the url to parse
     *  @param bool $with_query_string (whether to also include the query
     *      string at the end of the path)
     *  @return the host portion of the url if present; NULL otherwise
     */
    static function getPath($url, $with_query_string = false)
    {
        $url_parts = @parse_url($url);
        if(!isset($url_parts['path'])) {
            return NULL;
        }
        // windows hack
        $url_parts['path'] = str_replace("\/", "/", $url_parts['path']);

        $path = $url_parts['path'];
        $len = strlen($url);
        if($with_query_string && isset($url_parts['query'])) {
            $path .= "?".$url_parts['query'];
        } else if($with_query_string && $url[$len - 1] == "?") {
            $path .= "?"; //handle blank query string case
        }
        return $path;
    }

    /**
     * Gets an array of prefix urls from a given url. Each prefix contains at
     * least the the hostname of the the start url
     *
     * http://host.com/b/c/ would yield http://host.com/ , http://host.com/b,
     * http://host.com/b/, http://host.com/b/c, http://host.com/b/c/
     *
     * @param string $url the url to extract prefixes from
     * @return array the array of url prefixes
     */
    static function getHostPaths($url)
    {
        $host_paths = array($url);

        $host = self::getHost($url);
        if(!$host) {return $host_paths;}

        $host_paths[] = $host;

        $path = self::getPath($url);

        $path_parts = explode("/", $path);

        $url = $host;
        foreach($path_parts as $part) {
         if($part != "") {
            $url .="/$part";
            $host_paths[] = $url;
            }
            $host_paths[] = $url."/";
        }

        $host_paths = array_unique($host_paths);

        return $host_paths;

    }

    /**
     * Gets the subdomains of the host portion of a url. So
     *
     * http://a.b.c/d/f/
     * will return a.b.c, .a.b.c, b.c, .b.c, c, .c
     *
     * @param string $url the url to extract prefixes from
     * @return array the array of url prefixes
     */
    static function getHostSubdomains($url)
    {
        $subdomains = array();
        $url_parts = @parse_url($url);
        if(!isset($url_parts['host']) || strlen($url_parts['host']) <= 0) {
            return $subdomains;
        }
        $host = $url_parts['host'];
        $host_parts = explode(".", $host);
        $num_parts = count($host_parts);
        $domain = "";
        for($i = $num_parts - 1; $i >= 0 ; $i--) {
            $domain = $host_parts[$i].$domain;
            $subdomains[] = $domain;
            $domain = ".$domain";
            $subdomains[] = $domain;
        }

        return $subdomains;
    }

    /**
     * Checks if $path matches against any of the Robots.txt style regex
     * paths in $paths
     *
     * @param string $path a path component of a url
     * @param array $robot_paths in format of robots.txt regex paths
     * @return bool whether it is a member or not
     */
    static function isPathMemberRegexPaths($path, $robot_paths)
    {
        $is_member = false;

        $len = strlen($path);
        foreach($robot_paths as $robot_path) {
            $rlen = strlen($robot_path);
            if($rlen == 0) continue;
            $end_match = false;
            $end = ($robot_path[$rlen - 1] == "$") ? 1 : 0;
            $path_string = substr($robot_path, 0, $rlen - $end);
            $path_parts = explode("*", $path_string);
            $offset = -1;
            $old_part_len = 0;
            $is_match = true;
            foreach($path_parts as $part) {
                $offset += 1 + $old_part_len;
                $old_part_len = strlen($part);
                if($part == "") {
                    continue;
                } else if($offset >= $len) {
                    $is_match = false;
                    break;
                }
                $new_offset = stripos($path, $part, $offset);
                if($new_offset === false || ($offset == 0 && $new_offset > 0)) {
                    $is_match = false;
                    break;
                }
                $offset = $new_offset;
            }
            if($is_match) {
                if($end == 0 || strlen($part) + $offset == $len) {
                    $is_member = true;
                }
            }
        }
        return $is_member;
    }


    /**
     * Given a url, extracts the words in the host part of the url
     * provided the url does not have a path part more than / .
     * Ignores a leading www and also ignore tld.
     *
     * For example, "http://www.yahoo.com/" returns " yahoo "
     *
     * @param string $url a url to figure out the file type for
     *
     * @return string space separated words extracted.
     *
     */
    static function getWordsIfHostUrl($url)
    {
        $words = array();
        $url_parts = @parse_url($url);
        if(!isset($url_parts['host']) || strlen($url_parts['host']) <= 0
            || (isset($url_parts['path']) && $url_parts['path'] != "/")||
            isset($url_parts['query'])
            || isset($url_parts['fragment'])) {
            // if no host or has a query string bail
            return "";
        }
        $host = $url_parts['host'];
        $host_parts = explode(".", $host);
        if(count($host_parts) <= 1) {
            return "";
        }
        array_pop($host_parts); // get rid of tld
        if(stristr($host_parts[0],"www")) {
            array_shift($host_parts);
        }
        $words = array_merge($words, $host_parts);
        $word_string = " ".implode(" ", $words). " ";
        return $word_string;
    }

    /**
     * Given a url, extracts the words in the last path part of the url
     * For example,
     * http://us3.php.net/manual/en/function.array-filter.php
     * yields " function array filter "
     *
     * @param string $url a url to figure out the file type for
     *
     * @return string space separated words extracted.
     *
     */
    static function getWordsLastPathPartUrl($url)
    {
        $words = array();
        $url_parts = @parse_url($url);
        $path_info = @pathinfo($url_parts['path']);
        $path = "";
        if(isset($path_info['dirname'])) {
            $path .= $path_info['dirname']."/";
        }
        if(isset($path_info['filename'])) {
            $path .= $path_info['filename'];
        }
        $pre_path_parts = explode("/", $path);
        $count = count($pre_path_parts);
        if($count < 1 ) {
            return "";
        }
        $last_path = $pre_path_parts[$count - 1];
        $path_parts = preg_split("/(_|-|\ |\+|\.)/", $last_path);
        foreach($path_parts as $part) {
            if(strlen($part) > 0 ) {
                $words[] = $part;
            }
        }
        $word_string = " ".implode(" ", $words). " ";
        return $word_string;
    }

    /**
     * Given a url, makes a guess at the file type of the file it points to
     *
     * @param string $url a url to figure out the file type for
     *
     * @return string the guessed file type.
     *
     */
    static function getDocumentType($url)
    {

        $url_parts = @parse_url($url);

        if(!isset($url_parts['path'])) {
            return "html"; //we default to html
        } else if ($url[strlen($url) - 1] == "/"||$url[strlen($url) - 1]=="\\"){
            return "html";
        } else {
            $path_parts = pathinfo($url_parts['path']);

            if(!isset($path_parts["extension"]) ) {
             return "html"; //we default to html
            }

            return $path_parts["extension"];
        }

    }

    /**
     * Gets the filename portion of a url if present;
     * otherwise returns "Some File"
     *
     * @param string $url a url to parse
     * @return string the filename portion of this url
     */
    static function getDocumentFilename($url)
    {

        $url_parts = @parse_url($url);

        if(!isset($url_parts['path'])) {
            return "html"; //we default to html
        } else {
            $path_parts = pathinfo($url_parts['path']);

            if(!isset($path_parts["filename"]) ) {
                return "Some File";
            }

            return $path_parts["filename"];
        }

    }

    /**
     * Get the query string component of a url
     *
     * @param string $url  a url to get the query string out of
     * @return string the query string if present; NULL otherwise
     */
    static function getQuery($url)
    {
        $url_parts = @parse_url($url);
        if(isset($url_parts['query'])) {
            $out = $url_parts['query'];
        } else {
            $out = NULL;
        }

        return $out;
    }

    /**
     * Get the url fragment string component of a url
     *
     * @param string $url  a url to get the url fragment string out of
     * @return string the url fragment string if present; NULL otherwise
     */
    static function getFragment($url)
    {
        $url_parts = @parse_url($url);
        if(isset($url_parts['fragment'])) {
            $out = $url_parts['fragment'];
        } else {
            $out = NULL;
        }

        return $out;
    }

    /**
     * Given a $link that was obtained from a website $site, returns
     * a complete URL for that link.
     * For example, the $link
     * some_dir/test.html
     * on the $site
     * http://www.somewhere.com/bob
     * would yield the complete url
     * http://www.somewhere.com/bob/some_dir/test.html
     *
     * @param string $link  a relative or complete url
     * @param string $site  a base url
     * @param string $no_fragment if false then if the url had a fragment
     *      (#link_within_page) then the fragement will be included
     *
     * @return string a complete url based on these two pieces of information
     *
     */
    static function canonicalLink($link, $site, $no_fragment = true)
    {

        if(!self::isSchemeHttpOrHttps($link)) {return NULL;}

        if(isset($link[0]) &&
            $link[0] == "/" && isset($link[1]) && $link[1] == "/") {
            $http = ($site[4] == 's') ? "https:" : "http:";
            $link = $http . $link;
        }

        if(self::hasHostUrl($link)) {
            $host = self::getHost($link);
            $path = self::getPath($link);
            $query = self::getQuery($link);
            $fragment = self::getFragment($link);
        } else {

            $host = self::getHost($site);

            if($link !=NULL && $link[0] =="/") {
                $path = $link;
            } else {
                $site_path = self::getPath($site);
                $site_path_parts = pathinfo($site_path);

                if(isset($site_path_parts['dirname'])) {
                    $pre_path = $site_path_parts['dirname'];
                } else {
                    $pre_path = "";
                }
                if(isset($site_path_parts['basename']) &&
                    !isset($site_path_parts['extension'])) {
                    $pre_path .="/".$site_path_parts['basename'];
                }

                if(strlen($link) > 0) {
                     $pre_path = ($link[0] !="#") ? $pre_path."/".$link :
                        $pre_path . $link;
                }
                $path = self::getPath($pre_path);
                $so_far_link = $host . $pre_path;
                $query = self::getQuery($so_far_link);
                $fragment = self::getFragment($so_far_link);
            }
        }


        // take a stab at paths containing ..
        $path = preg_replace('/(\/\w+\/\.\.\/)+/', "/", $path);


        // if still has .. give up
        if(stristr($path, "../"))
        {
            return NULL;
        }

        // handle paths with dot in it
        $path = preg_replace('/(\.\/)+/', "", $path);
        $path = str_replace(" ", "%20", $path);


        $link_path_parts = pathinfo($path);

        $path2 = $path;
        do {
            $path = $path2;
            $path2 = str_replace("//","/", $path);
        } while($path != $path2);

        $path = str_replace("/./","/", $path);
        if($path == "." || substr($path, -2) == "/.") {
            $path = "/";
        }
        if($path == "" && !(isset($fragment) && $fragment !== "")) {
            $path = "/";
        }

        $url = $host.$path;

        if(isset($query) && $query !== "") {
            $url .= "?".$query;
        }

        if(isset($fragment) && $fragment !== "" && !$no_fragment) {
            $url .= "#".$fragment;
        }
        return $url;
    }

    /**
     * Checks if a url has a repeated set of subdirectories, and if the number
     * of repeats occurs more than some threshold number of times
     *
     *  A pattern like bob/.../bob counts as own reptition.
     * bob/.../alice/.../bob/.../alice would count as two (... should be read
     * as ellipsis, not a directory name).If the threshold is three and there
     * are at least three repeated mathes this function return true; it returns
     * false otherwise.
     *
     * @param string $url the url to check
     * @param int $repeat_threshold the number of repeats of a subdir name to
     *      trigger a true response
     * @return bool whether a repeated subdirectory name with more matches than
     *      the threshold was found
     *
     */
    static function checkRecursiveUrl($url, $repeat_threshold = 3)
    {
        $url_parts = mb_split("/", $url);

        $count= count($url_parts);
        $flag = 0;
        for($i = 0; $i < $count; $i++) {
            for($j = 0; $j < $i; $j++) {
                if($url_parts[$j] == $url_parts[$i]) {
                    $flag++;
                }
            }
        }

        if($flag > $repeat_threshold) {
            return true;
        }

        return false;

    }

    /**
     * Checks if a $url is on localhost
     *
     * @param string $url the url to check
     * @return bool whether or not it is on localhost
     */
    static function isLocalhostUrl($url)
    {
        $host = UrlParser::getHost($url, false);

        $localhosts = array("localhost", "127.0.0.1", "::1");
        if(isset($_SERVER["SERVER_NAME"])) {
            $localhosts[] = $_SERVER["SERVER_NAME"];
            $localhosts[] = gethostbyname($_SERVER["SERVER_NAME"]);
        }
        if(isset($_SERVER["SERVER_ADDR"])) {
            $localhosts[] = $_SERVER["SERVER_ADDR"];
        }
        foreach($localhosts as $localhost) {
            if(stristr($host, $localhost)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Checks if a URL corresponds to a known playback page of a video
     * sharing site
     *
     * @param string $url the url to check
     * @param array $video_prefixes an array of prefixes of video sites
     * @return bool whether or not corresponds to video playback page of a known
     *      video site
     */
    static function isVideoUrl(&$url, $video_prefixes)
    {
        if(strlen($url) <= 0 ) {
            return false;
        }
        foreach($video_prefixes as $prefix) {
            $quoted = preg_quote($prefix, "/");
            $patterns[] = "/$quoted/";
        }

        foreach($patterns as $pattern) {
            if(preg_match($pattern, $url) > 0) {
                return true;
            }
        }
        return false;
    }

    /**
     *  Used to delete links from array of links $links based on whether
     *  they are the same as the site they came from (or otherwise judged
     *  irrelevant)
     *
     *  @param array $links pairs of the form $link =>$text
     *  @param string $parent_url a site that the links were found on
     *  @return array just those links which pass the relevancy test
     */
    static function cleanRedundantLinks($links, $parent_url)
    {
        $out_links = array();
        foreach($links as $url => $text) {
            //ignore links back to oneself (too easy to spam)
            if(strcmp($parent_url, $url) != 0) {
                $out_links[$url] = $text;
            }
        }
        return $out_links;
    }
}

?>

ViewGit