Yioop_V9.5_Source_Code_Documentation

CrawlConstants

Shared constants and enums used by components that are involved in the crawling process

Tags
author

Chris Pollett

Table of Contents

ACTIVE_CLASSIFIERS  = 'cu'
ACTIVE_CLASSIFIERS_DATA  = 'cv'
ACTIVE_RANKERS  = 'db'
AGENT_LIST  = "bz"
ALLOWED_SITES  = 'aa'
ARC_DATA  = "cd"
ARC_DIR  = "cb"
ARC_TYPE  = "cc"
archive_base_name  = "Archive"
ARCHIVE_CRAWL  = 'ay'
ASCENDING  = 1
used for word iterator direction
AUX_DOCS  = 'ar'
AVERAGE_COLOR  = 'el'
BASIC_SUMMARIZER  = 'dk'
BOTH  = "IndexerAndScheduler"
Used to say what kind of queue_server this is
BREADTH_FIRST  = 'ac'
CACHE_PAGE_PARTITION  = 'ap'
CACHE_PAGE_VALIDATION_DATA  = 'cy'
CACHE_PAGE_VALIDATORS  = 'cx'
CACHE_PAGES  = 'cn'
CENTROID_SUMMARIZER  = 'dl'
CENTROID_WEIGHTED_SUMMARIZER  = 'dt'
CHANNEL  = 'eb'
CLD_IN_COMMON  = 'av'
CONTENT_SIZE  = 'dx'
CONTINUE_STATE  = 1
CRAWL_DELAY  = 'v'
CRAWL_INDEX  = 'ba'
CRAWL_ORDER  = 'Y'
crawl_status_file  = "CrawlStatus.txt"
CRAWL_TIME  = 'b'
CRAWL_TYPE  = 'az'
current_machine_info_file  = "CurrentMachineInfo.txt"
used by MediaUpdater to know what machine it is for distributed jobs
CURRENT_SERVER  = "bv"
DATA  = 'bt'
DEBUG  = "E"
DESCENDING  = -1
DESCRIPTION  = 't'
DESCRIPTION_SCORES  = 'A'
DIRECTION  = "F"
DISALLOWED_SITES  = 'ab'
DNS_TIME  = "by"
DOC_DEPTH  = 'M'
DOC_ID  = 'am'
DOC_INFO  = 'r'
DOC_LEN  = 'bi'
DOC_RANK  = 'N'
DOMAIN_WEIGHTS  = 'bm'
double_index_base_name  = "DoubleIndexData"
DUMMY  = 'V'
DURATION  = 'ee'
ENCODING  = 'f'
END_ITERATOR  = 'ct'
FALLBACK_PROCESSOR  = 'ep'
FAVICON_URL  = "K"
FEED_CRAWL_TIME  = 100
media feed index archive bundle timestamp
feed_index_data_base_name  = "IndexDataFeed"
fetch_archive_iterator  = "FetchArchiveIterator"
fetch_closed_name  = "FetchClosed"
fetch_crawl_info  = "FetchInfo"
FETCHER_QUEUE_SERVER_RATIO  = 'es'
FILE_NAME  = 'df'
FILETYPE  = 'ag'
GENERATION  = 'aq'
GRAPH_BASED_SUMMARIZER  = 'ds'
HASH  = 'o'
HASH_SEEN_URLS  = 'aj'
HASH_URL  = 'T'
HASH_URL_COUNT  = 'as'
HEADER  = 'bb'
HEIGHT  = 'B'
HOST_BUDGETING  = 'ad'
HTTP_CODE  = 'c'
IMAGE_LINK  = "dr"
INDEX  = 'z'
index_closed_name  = "IndexClosed"
index_data_base_name  = "IndexData"
INDEX_VERSION  = 'ei'
INDEXED_FILE_TYPES  = 'bq'
INDEXER  = "Indexer"
Used to say what kind of queue_server this is
INDEXING_PLUGINS  = 'bl'
INDEXING_PLUGINS_DATA  = "dd"
INI  = 'cq'
INLINKS  = 'P'
INVERTED_INDEX  = 'i'
IP_ADDRESSES  = 'au'
IS_BLACK_AND_WHITE  = 'em'
IS_DOC  = 'at'
IS_FEED  = 'ch'
IS_GOPHER_URL  = "dp"
IS_SAFE  = 'eh'
IS_VIDEO  = 'cg'
IS_VR  = 'ed'
JUST_METAS  = 'aw'
KEY  = "ce"
KEYWORD_LINKS  = 'cs'
LANG  = 'bg'
LINK_SEEN_URLS  = 'cj'
LINKS  = 'w'
local_ip_cache_file  = "LocalIpCache.txt"
LOCATION  = 'bp'
LOGGING  = 'cl'
MACHINE  = 'h'
MACHINE_ID  = 'cf'
MACHINE_URI  = 'ae'
MAX  = 1
Used in priority queue
MAX_DEPTH  = 'dz'
MAX_DESCRIPTION_LEN  = 'cw'
MAX_LINKS_TO_EXTRACT  = 'do'
MEMORY_USAGE  = 'al'
messages_data_base_name  = "MessagesData"
META_WORDS  = 'cm'
MIN  = -1
MINIMUM_FETCH_LOOP_TIME  = "dq"
mirror_table_name  = "MirrorTable.txt"
MODIFIED  = 'bf'
MORE_FILTER_TERMS  = "ej"
name_archive_iterator  = "NameArchiveIterator"
NEEDS_OFFSET_FLAG  = 0x7fffffff
network_base_name  = "Network"
network_crawllist_base_name  = "NetworkCrawlList"
network_status_file  = "NetworkStatus.txt"
NEW_CRAWL  = 'Q'
NO_DATA_STATE  = 2
NO_RANGE  = 'dy'
NUM_ICON_IMAGES  = "L"
NUM_PARTITIONS  = 'cz'
OFFSET  = 'R'
OPERATING_SYSTEM  = 'be'
PAGE  = 'q'
PAGE_RANGE_REQUEST  = 'br'
PAGE_RECRAWL_FREQUENCY  = 'bs'
PAGE_RULES  = 'ao'
PARTITION_NUM  = 'da'
PATHS  = 'S'
PINNED  = "G"
POSITION_LIST  = 'bn'
POST_MAX_SIZE  = 'ck'
PROCESSOR  = 'eo'
PROXIMITY  = 'bo'
PROXY_SERVERS  = 'di'
PUBDATE  = 'ef'
QUESTION_ANSWERS  = 'dw'
queue_base_name  = "QueueBundle"
QUEUE_SERVERS  = "bu"
RECENT_URLS  = 'ak'
REDO_STATE  = 4
RELEVANCE  = 'an'
REPEAT_TYPE  = 'ea'
REPOSITORY_TYPE  = 'de'
RESTRICT_SITES_BY_URL  = 'Z'
ROBOT_INSTANCE  = 'bh'
ROBOT_METAS  = "ca"
ROBOT_PATHS  = 'n'
robot_table_name  = "RobotTable.txt"
Name of the file used to store when various fetchers contacted a given QueueServer machine
ROBOT_TXT  = 'x'
ROBOTS_TXT  = 'D'
save_point  = "SavePoint"
SAVED_CRAWL_TIMES  = 'j'
schedule_name  = "FetchSchedule"
schedule_start_name  = "StartCrawlSchedule.txt"
SCHEDULE_TIME  = 'k'
SCHEDULER  = "Scheduler"
Used to say what kind of queue_server this is
SCORE  = 'X'
SCRAPER_INFO  = 'eq'
SCRAPER_LABEL  = 'du'
SCRAPERS  = 'dv'
SEEN_URLS  = 'g'
SEQUENCE_NUMBER  = 'er'
SERVER  = 'bc'
SERVER_VERSION  = 'bd'
SHA_HASH  = 'dg'
SITE_INFO  = 'af'
SITES  = 'W'
SIZE  = "bw"
SLEEP_DURATION  = "I"
SLEEP_START  = "H"
SOURCE_NAME  = 'ci'
START_PARTITION  = 'cp'
statistics_base_name  = "Statistics"
STATUS  = 'a'
STOP_STATE  = -1
states of daemon processes
SUBDOCS  = 'bj'
SUBDOCTYPE  = 'bk'
SUMMARIZER_OPTION  = 'dm'
SUMMARY  = 'ah'
SUMMARY_OFFSET  = 'U'
THUMB  = 'u'
THUMB_URL  = 'ec'
TIMESTAMP  = 'd'
TITLE  = 's'
TO_CRAWL  = 'y'
TOP_LEVEL_LINKS  = "J"
TOR_PROXY  = 'dh'
TOTAL_TIME  = "bx"
TYPE  = 'e'
UI_FLAGS  = 'cr'
URL  = 'l'
URL_INFO  = 'ai'
URL_PARENT  = "ek"
URL_WEIGHT  = 'O'
USER_RANKS  = "dc"
VIEW_COUNT  = 'en'
WAITING_START_MESSAGE_STATE  = 3
WARC_ID  = 'co'
WEB_CRAWL  = 'ax'
WEIGHT  = 'm'
WIDTH  = 'C'
WORD_CLOUD  = 'dn'

Constants

ACTIVE_CLASSIFIERS_DATA

public mixed ACTIVE_CLASSIFIERS_DATA = 'cv'

ASCENDING

used for word iterator direction

public mixed ASCENDING = 1

BOTH

Used to say what kind of queue_server this is

public mixed BOTH = "IndexerAndScheduler"

CACHE_PAGE_VALIDATION_DATA

public mixed CACHE_PAGE_VALIDATION_DATA = 'cy'

CACHE_PAGE_VALIDATORS

public mixed CACHE_PAGE_VALIDATORS = 'cx'

CENTROID_WEIGHTED_SUMMARIZER

public mixed CENTROID_WEIGHTED_SUMMARIZER = 'dt'

crawl_status_file

public mixed crawl_status_file = "CrawlStatus.txt"

current_machine_info_file

used by MediaUpdater to know what machine it is for distributed jobs

public mixed current_machine_info_file = "CurrentMachineInfo.txt"

double_index_base_name

public mixed double_index_base_name = "DoubleIndexData"

FEED_CRAWL_TIME

media feed index archive bundle timestamp

public mixed FEED_CRAWL_TIME = 100

feed_index_data_base_name

public mixed feed_index_data_base_name = "IndexDataFeed"

fetch_archive_iterator

public mixed fetch_archive_iterator = "FetchArchiveIterator"

fetch_closed_name

public mixed fetch_closed_name = "FetchClosed"

FETCHER_QUEUE_SERVER_RATIO

public mixed FETCHER_QUEUE_SERVER_RATIO = 'es'

GRAPH_BASED_SUMMARIZER

public mixed GRAPH_BASED_SUMMARIZER = 'ds'

index_closed_name

public mixed index_closed_name = "IndexClosed"

index_data_base_name

public mixed index_data_base_name = "IndexData"

INDEXER

Used to say what kind of queue_server this is

public mixed INDEXER = "Indexer"

INDEXING_PLUGINS_DATA

public mixed INDEXING_PLUGINS_DATA = "dd"

local_ip_cache_file

public mixed local_ip_cache_file = "LocalIpCache.txt"

messages_data_base_name

public mixed messages_data_base_name = "MessagesData"

MINIMUM_FETCH_LOOP_TIME

public mixed MINIMUM_FETCH_LOOP_TIME = "dq"

mirror_table_name

public mixed mirror_table_name = "MirrorTable.txt"

name_archive_iterator

public mixed name_archive_iterator = "NameArchiveIterator"

network_crawllist_base_name

public mixed network_crawllist_base_name = "NetworkCrawlList"

network_status_file

public mixed network_status_file = "NetworkStatus.txt"

PAGE_RECRAWL_FREQUENCY

public mixed PAGE_RECRAWL_FREQUENCY = 'bs'

robot_table_name

Name of the file used to store when various fetchers contacted a given QueueServer machine

public mixed robot_table_name = "RobotTable.txt"

schedule_start_name

public mixed schedule_start_name = "StartCrawlSchedule.txt"

SCHEDULER

Used to say what kind of queue_server this is

public mixed SCHEDULER = "Scheduler"

statistics_base_name

public mixed statistics_base_name = "Statistics"

STOP_STATE

states of daemon processes

public mixed STOP_STATE = -1

WAITING_START_MESSAGE_STATE

public mixed WAITING_START_MESSAGE_STATE = 3

        

Search results