diff --git a/src/configs/Config.php b/src/configs/Config.php index 47e137718..293cacc7f 100755 --- a/src/configs/Config.php +++ b/src/configs/Config.php @@ -710,6 +710,8 @@ nsconddefine('MAX_LINKS_PER_PAGE', 50); nsconddefine('AVG_LINKS_PER_PAGE', 24); /** maximum number of links to consider from a sitemap page */ nsconddefine('MAX_LINKS_PER_SITEMAP', MEMORY_PROFILE * 80); +/** minimum char length of link text before gets its own document */ +nsconddefine('MIN_LINKS_TEXT_DOC', 6); /** maximum number of words from links to consider on any given page */ nsconddefine('MAX_LINKS_WORD_TEXT', 100); /** maximum length of urls to try to queue, this is important for diff --git a/src/configs/PublicHelpPages.php b/src/configs/PublicHelpPages.php index ca3f33c2a..03444eaca 100644 --- a/src/configs/PublicHelpPages.php +++ b/src/configs/PublicHelpPages.php @@ -77,620 +77,620 @@ page_footer= sort=aname -END_HEAD_VARS=Yioop Wiki Syntax= - -: Wiki syntax is a lightweight way to markup a text document so that -it can be formatted and drawn nicely by Yioop. -This page briefly describes the wiki syntax supported by Yioop. - -==Headings== -: In wiki syntax headings of documents and sections are written as follows: - -<nowiki> -=Level1= -==Level2== -===Level3=== -====Level4==== -=====Level5===== -======Level6====== -</nowiki> - -and would look like: - -=Level1= -==Level2== -===Level3=== -====Level4==== -=====Level5===== -======Level6====== - -==Paragraphs== -: In Yioop two new lines indicates a new paragraph. You can control -the indent of a paragraph by putting colons followed by a space in front of it: - -<nowiki> -: some indent - -:: a little more - -::: even more - -:::: that's sorta crazy -</nowiki> - -which looks like: - -: some indent - -:: a little more - -::: even more - -:::: that's sorta crazy - -==Horizontal Rule== -: Sometimes it is convenient to separate paragraphs or sections with a horizontal -rule. 
This can be done by placing four hyphens on a line by themselves: -<nowiki> ----- -</nowiki> -This results in a line that looks like: ----- - -==Text Formatting Within Paragraphs== -: Within a paragraph it is often convenient to make some text bold, italics, -underlined, etc. Below is a quick summary of how to do this: -===Wiki Markup=== -{| -|<nowiki>''italic''</nowiki>|''italic'' -|- -|<nowiki>'''bold'''</nowiki>|'''bold''' -|- -|<nowiki>'''''bold and italic'''''</nowiki>|'''''bold and italic''''' -|} - -===HTML Tags=== -: Yioop also supports several html tags such as: -{| -|<nowiki><del>delete</del></nowiki>|<del>delete</del> -|- -|<nowiki><ins>insert</ins></nowiki>|<ins>insert</ins> -|- -|<nowiki><s>strike through</s> or -<strike>strike through</strike> </nowiki>|<s>strike through</s> -|- -|<nowiki><sup>superscript</sup> and -<sub>subscript</sub></nowiki>|<sup>superscript</sup> and -<sub>subscript</sub> -|- -|<nowiki><tt>typewriter</tt></nowiki>|<tt>typewriter</tt> -|- -|<nowiki><u>underline</u></nowiki>|<u>underline</u> -|} - -===Spacing within Paragraphs=== -: The HTML entity -<nowiki>&nbsp;</nowiki> -can be used to create a non-breaking space. The tag -<nowiki><br></nowiki> -can be used to produce a line break. - -==Preformatted Text and Unformatted Text== -: You can force text to be formatted as you typed it rather -than using the layout mechanism of the browser using the -<nowiki><pre>preformatted text tag.</pre></nowiki> -Alternatively, a sequence of lines all beginning with a -space character will also be treated as preformatted. - -: Wiki markup within pre tags is still parsed by Yioop. -If you would like to add text that is not parsed, enclosed -it in <tt><`mbox{nowiki}`> </`mbox{nowiki}`></tt> tags. 
- -==Styling Text Paragraphs== -: Yioop wiki syntax offers a number of templates for -control the styles, and alignment of text for -a paragraph or group of paragraphs:<br /> -`{{`left| some text`}}`,<br /> `{{`right| some text`}}`,<br /> -and<br /> -`{{`center| some text`}}`<br /> can be used to left-justify, -right-justify, and center a block of text. For example, -the last command, would produce: -{{center| -some text -}} -If you know cascading style sheets (CSS), you can set -a class or id selector for a block of text using:<br /> -`{{`class="my-class-selector" some text`}}`<br />and<br /> -`{{`id="my-id-selector" some text`}}`.<br /> -You can also apply inline styles to a block of text -using the syntax:<br /> -`{{`style="inline styles" some text`}}`.<br /> -For example, `{{`style="color:red" some text`}}` looks -like {{style="color:red" some text}}. - -==Lists== -: The Yioop Wiki Syntax supported of ways of listing items: -bulleted/unordered list, numbered/ordered lists, and -definition lists. Below are some examples: - -===Unordered Lists=== -<nowiki> -* Item1 -** SubItem1 -** SubItem2 -*** SubSubItem1 -* Item 2 -* Item 3 -</nowiki> -would be drawn as: -* Item1 -** SubItem1 -** SubItem2 -*** SubSubItem1 -* Item 2 -* Item 3 - -===Ordered Lists=== -<nowiki> -# Item1 -## SubItem1 -## SubItem2 -### SubSubItem1 -# Item 2 -# Item 3 -</nowiki> -# Item1 -## SubItem1 -## SubItem2 -### SubSubItem1 -# Item 2 -# Item 3 - -===Mixed Lists=== -<nowiki> -# Item1 -#* SubItem1 -#* SubItem2 -#*# SubSubItem1 -# Item 2 -# Item 3 -</nowiki> -# Item1 -#* SubItem1 -#* SubItem2 -#*# SubSubItem1 -# Item 2 -# Item 3 - -===Definition Lists=== -<nowiki> -;Term 1: Definition of Term 1 -;Term 2: Definition of Term 2 -</nowiki> -;Term 1: Definition of Term 1 -;Term 2: Definition of Term 2 - -==Tables== -: A table begins with {`|` and ends with `|`}. 
Cells are separated with | and -rows are separated with |- as can be seen in the following -example: -<nowiki> -{| -|a||b -|- -|c||d -|} -</nowiki> -{| -|a||b -|- -|c||d -|} -Headings for columns and rows can be made by using an exclamation point, !, -rather than a vertical bar |. For example, -<nowiki> -{| -!a!!b -|- -|c||d -|} -</nowiki> -{| -!a!!b -|- -|c||d -|} -Captions can be added using the + symbol: -<nowiki> -{| -|+ My Caption -!a!!b -|- -|c||d -|} -</nowiki> -{| -|+ My Caption -!a!!b -|- -|c||d -|} -Finally, you can put a CSS class or style attributes (or both) on the first line -of the table to further control how it looks: -<nowiki> -{| class="wikitable" -|+ My Caption -!a!!b -|- -|c||d -|} -</nowiki> -{| class="wikitable" -|+ My Caption -!a!!b -|- -|c||d -|} -Within a cell attributes like align, valign, styles, and class can be used. For -example, -<nowiki> -{| -| style="text-align:right;"| a| b -|- -| lalala | lalala -|} -</nowiki> -{| -| style="text-align:right;"| a| b -|- -| lalala | lalala -|} - -==Math== - -: Math can be included into a wiki document by either using the math tag: -<nowiki> -<math> -\sum_{i=1}^{n} i = frac{(n+1)(n)}{2} -</math> -</nowiki> - -<math> -\sum_{i=1}^{n} i = frac{(n+1)(n)}{2} -</math> - -or by enclosing the math in backticks: - -<pre> -`[[1, -2],[3,4]]` -</pre> - -`[[1, -2],[3,4]]`. - -Rendering of math is done using [[https://www.mathjax.org/|MathJax]], making us of the [[https://en.wikipedia.org/wiki/ASCIIMathML|ASCIImathml]] extensions. - -==Links and Relationships== -: A hypertext link to another document can be inserted into a wiki page using -the chain link icon in the GUI. Alternatively, there are several techniques -for inserting a link into a page depending on whether the link is to a page -within the same wiki group, is a link to a page on a different wiki -group, or is a link to a different website. In addition to normal -hypertext links, Yioop also supports relationship links. 
- -'''Intra-Group Wiki Links''' use the syntax: -<nowiki> -[[name_of_wiki_page]] -or -[[name_of_wiki_page|text for the link]] -or -[[name_of_wiki_page#heading_or_id_on_page|text for the link]] -</nowiki> -for example, to make a link to this Syntax page one could write, -<nowiki> -[[Syntax|Yioop Wiki Syntax Page]] -</nowiki> -which would look like, - -[[Syntax|Yioop Wiki Syntax Page]] - -'''Inter-Group Wiki Links''' use the syntax: -<nowiki> -[[name_of-group@name_of_wiki_page|text for the link]] -</nowiki> - -'''Different Website Links''' use the syntax: -<nowiki> -[[website_url|text for the link]] -</nowiki> - -: Relationships are a generalized form of link. They are used to express -a more complicated linking between two wiki pages and have the syntax: - -<nowiki> -[[relationship_type|wiki_page_name|text for the link]] -</nowiki> - -: In the navigation dropdown for a Yioop wiki page there are items for -what links to the current page and what relates to the current page -based on the links and relationships a page belongs to. - -==Recent Places Dropdowns== -: You can add a dropdown that can allow users to navigate to recently visited -wiki pages using the syntax: - -<sub>`[`{recent_places}]</sub> - -This looks like: - -[{recent_places}] - -==Adding Resources to a Page== - -: Yioop wiki syntax supports adding search bars, audio, images, and video to a -page. The magnifying class edit tool icon can be used to add a search bar via -the GUI. This can also be added by hand with the syntax: -<nowiki> -{{search:default|size:small|placeholder:Search Placeholder Text}} -</nowiki> -This syntax is split into three parts each separated by a vertical bar |. The -first part search:default means results from searches should come from the -default search index. You can replace default with the timestamp of a specific -index or mix if you do not want to use the default. The second group size:small -indicates the size of the search bar to be drawn. 
Choices of size are small, -medium, and large. Finally, placeholder:Search Placeholder Text indicates the -grayed out background text in the search input before typing is done should -read: Search Placeholder Text. Here is what the above code outputs: - -{{search:default|size:small|placeholder:Search Placeholder Text}} - -: Image, video and other media resources can be associated with a page by dragging -and dropping them in the edit textarea or by clicking on the link click to select -link in the gray box below the textarea. This would add wiki code such as - -<sub>((resource`:`myphoto.jpg|Resource Description))</sub> - -to the page. Only saving the page will save this code and upload the resource to -the server. In the above ''myphoto.jpg'' is the resource that will be inserted and -Resource Description is the alternative text to use in case the viewing browser -cannot display jpg files. To add a resource -from a different wiki page belonging to the same group to the current wiki -page one can use a syntax like: - -<sub>((resource`:`Documentation:ConfigureScreenForm1.png|The work directory form))</sub> - -Here Documentation would be the page and ConfigureScreenForm1.png the resource. -You can also insert resources from a data-string using ''resource-data'' rather than -''resource''. For example: - -<sub>((resource-data`:`image/jpeg;base64,/9j/4AAQSkZJRg...rest of image data...|Seekquarry Logo))</sub> - -could be used to inline an image like: - -((resource-|The Seekquarry Logo)) - -be aware though that the default maximum wiki page size is 512Kb (this can be set in src/configs/Config.php). - -: Sometimes it is useful to edit the basic resource link -above to make a link which is a thumbnail of the resource which points to a -separate page containing that resource. 
This can be done using the syntax: - -<sub>((resource-thumb`:`myphoto.jpg|Resource Description))</sub> - -: Similarly, by default for resources like PDFs, epub's, etc., the resource tag inlines -the whole resource into the page, if instead one wants a clickable link to a page where -the resource is displayed one can use the syntax: - -<sub>((resource-link`:`my_document.pdf|Resource Description))</sub> - -: Comma separated value files (.csv or CSV files) are inlined into a page as a table. Which rows and columns of the CSV to present in this table can be controlled by the resource line. The general format for including -a CSV resource is: - -<sub> ((resource`:`resource_name.csv#config#top_left_cell#bottom_right_cell|Resource Description))</sub> - -For example, - -<sub>((resource`:`resource_name.csv##B2#C3|Resource Description))</sub> - -might output - -((resource-data:text/csv;base64,LCwsLAosLTIsMywsCiw1LDQsLAosLCwsCiwsLCwK##B2#C3|Example CSV with Headings)) - -I.e., just the portion of the CSV given by the rectangle between the cells B2 and C3. Using a config directive we can omit the spreadsheet row and column headings as follows: - -<sub>((resource`:`resource_name.csv#noheadings#B2#C3|Resource Description)) </sub> - -which might output - - -((resource-data:text/csv;base64,LCwsLAosLTIsMywsCiw1LDQsLAosLCwsCiwsLCwK#noheadings#B2#C3|Example CSV without Headings)) - -CSV spreadsheet files can also be used to output a variety of charts. The general format for the command to insert a chart resource is: - -<sub>((resource-chart_type`:`resource_name.csv#char_config#x_start#x_end#y_start#y_end|Resource Description))</sub> - -Here ''chart_type'' can be one of ''bargraph'', ''linegraph'', or ''pointgraph''. 
For example, one might have a line like: - -<sub>((resource-bargraph`:`resource_name.csv##B1#B4#C1#C4|Quadratic Function)) </sub> - -which could produce a chart like - -((resource-bargraph:##(1,1)#(2,4)#(3,9)#(4,16)|Quadratic Function)) - -In the above example, the values for the `x` coordinates would come from the cells B1, B2, B3, B4 from -''resource_name.csv '' and the values for the `y` coordinates would come from cells C1, C2, C3, C4 from -''resource_name.csv ''. Alternatively, rather than use a CSV to get out data we can just list the points we want to plot with a command like: - -<sub>((resource-bargraph`:`##(1,1)#(2,4)#(3,9)#(4,16)|Quadratic Function))</sub> - -==Manipulating Page Resources== - -: A list of media that have already been associated with -a page appears under the Page Resource heading below the textarea. This -table allows the user to rename and delete resources as well as insert the -same resource at multiple locations within the same document. - -: The resources section of the edit page can be thought of as similar to -a folder in Windows or MacOS. One can have subfolders of the resource folder. - -: The '''Places''' dropdown at the top of the '''Page Resource''' section allows one to navigate -these folders. - -: The '''Filter''' textfield lets you enter a search string. -Clicking '''Go''' then shows only those resources -which contain that search string in their title. - -: The '''Clip Folder''' dropdown is used to copy files between folders and pages. -Its current value is the folder that the '''Clip Copy''' buttons next to resources -will copy their resource to when clicked. You can set the '''Clip Folder''' to -the current folder using the dropdown, then move to the page and folder that -you would like to copy stuff from and click the '''Clip Copy''' button of the -desired resource. - -: The '''Name''', '''Size''', '''Modified''' header links above the resources list -control the sort order for the resource list. 
If a page is a media list page, -then even in read mode, the sort order selected is remembered when drawing the -media list. - -: The '''Actions''' drop can be used to create new folders, new text files, and new csv -text files within the current page resource folder. These are initial named beginning -with ''untitled'' followed by some number, and if applicable a file extension. - -: Resources entries for the resources list consist first of an icon, followed by a textfield -with a name for the resource, followed by buttons for actions that can be done to that resource -(Rename, Add to Page, Clip Copy), followed by a link [X], which can be used to delete the resource. -If a resource is editable the icon will look like a plus sign together with a pencil. Clicking -on the icon will then let you edit the resource. - -===Text and CSV Resources=== -: For normal text files clicking edit will bring up a textarea with the context of the text to edit. -For CSV (comma separated value) files this will present the file as an editable spreadsheet. -Yioop spreadsheets can have equation much like Excel spreadsheets. Clicking on a cell lets one -edit its contents. For example, if in the cell A3 -one entered the equation: - = A1+A2 -then clicking out of the cell would cause it to refresh with the value of the sum of the contents of -cells A1 and A2. In addition, to the standard arithmetic operators ['*', '/', '+', '-', '%'], the -spreadsheet expressions can use float or integer literals, and can make use of the following table -of built-in functions: - -{| class="wikitable" -!Function Name!!Description -|- -!avg(x1,...,xn), avg(x1:xn)|| computes average of values of cells listed as arguments -|- -!ceil(x)|| rounds the value of x up to nearest integer -|- -!cell(i,j)|| returns the contents of the cell with column name of letter j, and row name i. For example, cell(2,'B') would return the contents of cell B2. 
-|- -!col(value, search_row, start_col, end_col)|| searches the row ''search_row'' between the columns -''start_col'', ''end_col'' for ''value''. Returns the column name where this value was found or -1 if not found. -For example, col(3, 2, "B", "D") might return C if the cell C2 had value 3. -|- -!exp(x)|| computes `e^x` -|- -!floor(x)|| rounds the value of x down to the nearest integer -|- -!log(x)|| computes `log x` -|- -!min(x1,...,xn), min(x1:xn)|| computes minimum value of cells listed as arguments -|- -!max(x1,...,xn), max(x1:xn)|| computes maximum value of cells listed as arguments -|- -!pow(x,y)|| computes `x^y` -|- -!row(value, search_col, start_row, end_row)|| searches the column ''search_col'' -between the rows ''start_row'', ''end_row'' for ''value''. -Returns the row name where this value was found or -1 if not found. -For example, row(3, "C", "1", "5") might return 2 if the cell C2 had value 3. -|- -!sqrt(x)|| computes `sqrt(x)` -|- -!sum(x1,...,xn), sum(x1:xn)|| computes sum of values of cells listed as arguments -|- -!username()|| returns username of the person using this CSV file -|} - -===HTML, PDF and EPub Resources=== -: How HTML, PDF, EPub resources included on a page render depends on how the Yioop wiki software -has been configured. If no special configuration has been done, then HTML and PDF documents -will bbe rendered in an <iframe> tag within the current wiki page. In the EPub, case a link -to download the resource will be given. If the wiki software detects the presence of the -file APP_DIR/scripts/pdf.js ([[https://en.wikipedia.org/wiki/PDF.js|PDF.js]]) -or APP_DIR/scripts/epub.js ([[https://github.com/futurepress/epub.js|epub.js]]), the wiki -system will render the resource in a Javascript viewer and will do things like remember reading -position. - - -===Video and Audio Resources=== - -: Not all browsers support the same video and audio formats for playback. 
For this reason -it sometimes is useful to have multiple video resources for the same video. For example, -you might have a .ogv and .vp8 version of the same video recording. In read (non-edit) -mode, the Yioop wiki system displays only one link for video or audio files that have -the same name except for extension. It then includes the grouped file as separated <source> -tags within either the <video> or <audio> html tag used to render the item in the browser. -In this way, you can make your media take best advantages to whatever capabilities your -client's browser has. If you don't feel like recoding your media in such a fancy way, a safe -rule of thumb is that .mp3 audio will playback in all modern browser, and that .mp4 video -will playback in all modern browser. - -: For video it is sometimes useful to add a subtitle or caption track. Yioop wiki supports -[[https://en.wikipedia.org/wiki/WebVTT|WebVTT]] format subtitles and captions. To see how -Yioop wiki makes use of these files, suppose you included a resource ''foo.mp4'' in your -wiki pages, and you also had a file named ''foo-captions-en-US.vtt'' then when the HTML -page is generated from your wiki page, a <track> tag for the caption file would be added -to the <video> tag. A user seeing this page would then see in the video player a closed caption -symbol and be able to turn on/off (defaults off) the English captions. If you wanted -named the file ''foo-subtitles-en-US.vtt'' instead, then Yioop wiki would include it as a -subtitles track (defaults on). You can add captions/subtitle files for as many languages as -desired. - -: When viewing the page resources for a page in edit mode, one can see one file/resource and -no grouping of resources by name is done. In this way you can keep track of exactly what -resources are available for a page. - -==Page Settings, Page Type== - -: In edit mode for a wiki page, next to the page name, is a link [Settings]. 
-Clicking this link expands a form which can be used to control global settings -for a wiki page. This form contains a drop down for the page type, another -drop down for the type of border for the page in non-logged in mode, -a checkbox for whether a table of contents should be auto-generated from level 2 -and level three headings and then text -fields or areas for the page title, author, meta robots, and page description. -Beneath this one can specify another wiki page to be used as a header for this -page and also specify another wiki page to be used as a footer for this page. - -: The contents of the page title is displayed in the browser title when the -wiki page is accessed with the Activity Panel collapsed or when not logged in. -Similarly, in the collapsed or not logged in mode, if one looks as the HTML -page source for the page, in the head of document, <meta> tags for author, -robots, and description are set according to these fields. These fields can -be useful for search engine optimization. The robots meta tag can be -used to control how search engine robots index the page. Wikipedia has more information on -[[https://en.wikipedia.org/wiki/Meta_element|Meta Elements]]. - -: The '''Standard''' page type treats the page as a usual wiki page. - -: '''Page Alias''' type redirects the current page to another page name. This can -be used to handle things like different names for the same topic or to do localization -of pages. For example, if you switch the locale from English to French and -you were on the wiki page dental_floss when you switch to French the article -dental_floss might redirect to the page dentrifice. - -: '''Media List''' type means that the page, when read, should display just the -resources in the page as a list of thumbnails and links. These links for the -resources go to a separate pages used to display these resources. -This kind of page is useful for a gallery of -images or a collection of audio or video files. 
- -: '''Presentation''' type is for a wiki page whose purpose is a slide presentation. In this mode, -.... -on a line by itself is used to separate one slide. If presentation type is a selected a new -slide icon appears in the wiki edit bar allowining one to easily add new slides. -When the Activity panel is not collapsed and you are reading a presentation, it just -displays as a single page with all slides visible. Collapsing the Activity panel presents -the slides as a typical slide presentation using the +END_HEAD_VARS=Yioop Wiki Syntax= + +: Wiki syntax is a lightweight way to markup a text document so that +it can be formatted and drawn nicely by Yioop. +This page briefly describes the wiki syntax supported by Yioop. + +==Headings== +: In wiki syntax headings of documents and sections are written as follows: + +<nowiki> +=Level1= +==Level2== +===Level3=== +====Level4==== +=====Level5===== +======Level6====== +</nowiki> + +and would look like: + +=Level1= +==Level2== +===Level3=== +====Level4==== +=====Level5===== +======Level6====== + +==Paragraphs== +: In Yioop two new lines indicates a new paragraph. You can control +the indent of a paragraph by putting colons followed by a space in front of it: + +<nowiki> +: some indent + +:: a little more + +::: even more + +:::: that's sorta crazy +</nowiki> + +which looks like: + +: some indent + +:: a little more + +::: even more + +:::: that's sorta crazy + +==Horizontal Rule== +: Sometimes it is convenient to separate paragraphs or sections with a horizontal +rule. This can be done by placing four hyphens on a line by themselves: +<nowiki> +---- +</nowiki> +This results in a line that looks like: +---- + +==Text Formatting Within Paragraphs== +: Within a paragraph it is often convenient to make some text bold, italics, +underlined, etc. 
Below is a quick summary of how to do this: +===Wiki Markup=== +{| +|<nowiki>''italic''</nowiki>|''italic'' +|- +|<nowiki>'''bold'''</nowiki>|'''bold''' +|- +|<nowiki>'''''bold and italic'''''</nowiki>|'''''bold and italic''''' +|} + +===HTML Tags=== +: Yioop also supports several html tags such as: +{| +|<nowiki><del>delete</del></nowiki>|<del>delete</del> +|- +|<nowiki><ins>insert</ins></nowiki>|<ins>insert</ins> +|- +|<nowiki><s>strike through</s> or +<strike>strike through</strike> </nowiki>|<s>strike through</s> +|- +|<nowiki><sup>superscript</sup> and +<sub>subscript</sub></nowiki>|<sup>superscript</sup> and +<sub>subscript</sub> +|- +|<nowiki><tt>typewriter</tt></nowiki>|<tt>typewriter</tt> +|- +|<nowiki><u>underline</u></nowiki>|<u>underline</u> +|} + +===Spacing within Paragraphs=== +: The HTML entity +<nowiki>&nbsp;</nowiki> +can be used to create a non-breaking space. The tag +<nowiki><br></nowiki> +can be used to produce a line break. + +==Preformatted Text and Unformatted Text== +: You can force text to be formatted as you typed it rather +than using the layout mechanism of the browser using the +<nowiki><pre>preformatted text tag.</pre></nowiki> +Alternatively, a sequence of lines all beginning with a +space character will also be treated as preformatted. + +: Wiki markup within pre tags is still parsed by Yioop. +If you would like to add text that is not parsed, enclose +it in <tt><`mbox{nowiki}`> </`mbox{nowiki}`></tt> tags. + +==Styling Text Paragraphs== +: Yioop wiki syntax offers a number of templates for +controlling the styles and alignment of text for +a paragraph or group of paragraphs:<br /> +`{{`left| some text`}}`,<br /> `{{`right| some text`}}`,<br /> +and<br /> +`{{`center| some text`}}`<br /> can be used to left-justify, +right-justify, and center a block of text. 
For example, +the last command would produce: +{{center| +some text +}} +If you know cascading style sheets (CSS), you can set +a class or id selector for a block of text using:<br /> +`{{`class="my-class-selector" some text`}}`<br />and<br /> +`{{`id="my-id-selector" some text`}}`.<br /> +You can also apply inline styles to a block of text +using the syntax:<br /> +`{{`style="inline styles" some text`}}`.<br /> +For example, `{{`style="color:red" some text`}}` looks +like {{style="color:red" some text}}. + +==Lists== +: The Yioop Wiki Syntax supports a number of ways of listing items: +bulleted/unordered list, numbered/ordered lists, and +definition lists. Below are some examples: + +===Unordered Lists=== +<nowiki> +* Item1 +** SubItem1 +** SubItem2 +*** SubSubItem1 +* Item 2 +* Item 3 +</nowiki> +would be drawn as: +* Item1 +** SubItem1 +** SubItem2 +*** SubSubItem1 +* Item 2 +* Item 3 + +===Ordered Lists=== +<nowiki> +# Item1 +## SubItem1 +## SubItem2 +### SubSubItem1 +# Item 2 +# Item 3 +</nowiki> +# Item1 +## SubItem1 +## SubItem2 +### SubSubItem1 +# Item 2 +# Item 3 + +===Mixed Lists=== +<nowiki> +# Item1 +#* SubItem1 +#* SubItem2 +#*# SubSubItem1 +# Item 2 +# Item 3 +</nowiki> +# Item1 +#* SubItem1 +#* SubItem2 +#*# SubSubItem1 +# Item 2 +# Item 3 + +===Definition Lists=== +<nowiki> +;Term 1: Definition of Term 1 +;Term 2: Definition of Term 2 +</nowiki> +;Term 1: Definition of Term 1 +;Term 2: Definition of Term 2 + +==Tables== +: A table begins with {`|` and ends with `|`}. Cells are separated with | and +rows are separated with |- as can be seen in the following +example: +<nowiki> +{| +|a||b +|- +|c||d +|} +</nowiki> +{| +|a||b +|- +|c||d +|} +Headings for columns and rows can be made by using an exclamation point, !, +rather than a vertical bar |. 
For example, +<nowiki> +{| +!a!!b +|- +|c||d +|} +</nowiki> +{| +!a!!b +|- +|c||d +|} +Captions can be added using the + symbol: +<nowiki> +{| +|+ My Caption +!a!!b +|- +|c||d +|} +</nowiki> +{| +|+ My Caption +!a!!b +|- +|c||d +|} +Finally, you can put a CSS class or style attributes (or both) on the first line +of the table to further control how it looks: +<nowiki> +{| class="wikitable" +|+ My Caption +!a!!b +|- +|c||d +|} +</nowiki> +{| class="wikitable" +|+ My Caption +!a!!b +|- +|c||d +|} +Within a cell attributes like align, valign, styles, and class can be used. For +example, +<nowiki> +{| +| style="text-align:right;"| a| b +|- +| lalala | lalala +|} +</nowiki> +{| +| style="text-align:right;"| a| b +|- +| lalala | lalala +|} + +==Math== + +: Math can be included into a wiki document by either using the math tag: +<nowiki> +<math> +\sum_{i=1}^{n} i = frac{(n+1)(n)}{2} +</math> +</nowiki> + +<math> +\sum_{i=1}^{n} i = frac{(n+1)(n)}{2} +</math> + +or by enclosing the math in backticks: + +<pre> +`[[1, -2],[3,4]]` +</pre> + +`[[1, -2],[3,4]]`. + +Rendering of math is done using [[https://www.mathjax.org/|MathJax]], making use of the [[https://en.wikipedia.org/wiki/ASCIIMathML|ASCIImathml]] extensions. + +==Links and Relationships== +: A hypertext link to another document can be inserted into a wiki page using +the chain link icon in the GUI. Alternatively, there are several techniques +for inserting a link into a page depending on whether the link is to a page +within the same wiki group, is a link to a page on a different wiki +group, or is a link to a different website. In addition to normal +hypertext links, Yioop also supports relationship links. 
+ +'''Intra-Group Wiki Links''' use the syntax: +<nowiki> +[[name_of_wiki_page]] +or +[[name_of_wiki_page|text for the link]] +or +[[name_of_wiki_page#heading_or_id_on_page|text for the link]] +</nowiki> +for example, to make a link to this Syntax page one could write, +<nowiki> +[[Syntax|Yioop Wiki Syntax Page]] +</nowiki> +which would look like, + +[[Syntax|Yioop Wiki Syntax Page]] + +'''Inter-Group Wiki Links''' use the syntax: +<nowiki> +[[name_of_group@name_of_wiki_page|text for the link]] +</nowiki> + +'''Different Website Links''' use the syntax: +<nowiki> +[[website_url|text for the link]] +</nowiki> + +: Relationships are a generalized form of link. They are used to express +a more complicated linking between two wiki pages and have the syntax: + +<nowiki> +[[relationship_type|wiki_page_name|text for the link]] +</nowiki> + +: In the navigation dropdown for a Yioop wiki page there are items for +what links to the current page and what relates to the current page +based on the links and relationships a page belongs to. + +==Recent Places Dropdowns== +: You can add a dropdown that can allow users to navigate to recently visited +wiki pages using the syntax: + +<sub>`[`{recent_places}]</sub> + +This looks like: + +[{recent_places}] + +==Adding Resources to a Page== + +: Yioop wiki syntax supports adding search bars, audio, images, and video to a +page. The magnifying glass edit tool icon can be used to add a search bar via +the GUI. This can also be added by hand with the syntax: +<nowiki> +{{search:default|size:small|placeholder:Search Placeholder Text}} +</nowiki> +This syntax is split into three parts each separated by a vertical bar |. The +first part search:default means results from searches should come from the +default search index. You can replace default with the timestamp of a specific +index or mix if you do not want to use the default. The second group size:small +indicates the size of the search bar to be drawn. 
Choices of size are small, +medium, and large. Finally, placeholder:Search Placeholder Text indicates the +grayed out background text in the search input before typing is done should +read: Search Placeholder Text. Here is what the above code outputs: + +{{search:default|size:small|placeholder:Search Placeholder Text}} + +: Image, video and other media resources can be associated with a page by dragging +and dropping them in the edit textarea or by clicking on the link click to select +link in the gray box below the textarea. This would add wiki code such as + +<sub>((resource`:`myphoto.jpg|Resource Description))</sub> + +to the page. Only saving the page will save this code and upload the resource to +the server. In the above ''myphoto.jpg'' is the resource that will be inserted and +Resource Description is the alternative text to use in case the viewing browser +cannot display jpg files. To add a resource +from a different wiki page belonging to the same group to the current wiki +page one can use a syntax like: + +<sub>((resource`:`Documentation:ConfigureScreenForm1.png|The work directory form))</sub> + +Here Documentation would be the page and ConfigureScreenForm1.png the resource. +You can also insert resources from a data-string using ''resource-data'' rather than +''resource''. For example: + +<sub>((resource-data`:`image/jpeg;base64,/9j/4AAQSkZJRg...rest of image data...|Seekquarry Logo))</sub> + +could be used to inline an image like: + +((resource-|The Seekquarry Logo)) + +be aware though that the default maximum wiki page size is 512Kb (this can be set in src/configs/Config.php). + +: Sometimes it is useful to edit the basic resource link +above to make a link which is a thumbnail of the resource which points to a +separate page containing that resource. 
This can be done using the syntax: + +<sub>((resource-thumb`:`myphoto.jpg|Resource Description))</sub> + +: Similarly, by default for resources like PDFs, epub's, etc., the resource tag inlines +the whole resource into the page, if instead one wants a clickable link to a page where +the resource is displayed one can use the syntax: + +<sub>((resource-link`:`my_document.pdf|Resource Description))</sub> + +: Comma separated value files (.csv or CSV files) are inlined into a page as a table. Which rows and columns of the CSV to present in this table can be controlled by the resource line. The general format for including +a CSV resource is: + +<sub> ((resource`:`resource_name.csv#config#top_left_cell#bottom_right_cell|Resource Description))</sub> + +For example, + +<sub>((resource`:`resource_name.csv##B2#C3|Resource Description))</sub> + +might output + +((resource-data:text/csv;base64,LCwsLAosLTIsMywsCiw1LDQsLAosLCwsCiwsLCwK##B2#C3|Example CSV with Headings)) + +I.e., just the portion of the CSV given by the rectangle between the cells B2 and C3. Using a config directive we can omit the spreadsheet row and column headings as follows: + +<sub>((resource`:`resource_name.csv#noheadings#B2#C3|Resource Description)) </sub> + +which might output + + +((resource-data:text/csv;base64,LCwsLAosLTIsMywsCiw1LDQsLAosLCwsCiwsLCwK#noheadings#B2#C3|Example CSV without Headings)) + +CSV spreadsheet files can also be used to output a variety of charts. The general format for the command to insert a chart resource is: + +<sub>((resource-chart_type`:`resource_name.csv#char_config#x_start#x_end#y_start#y_end|Resource Description))</sub> + +Here ''chart_type'' can be one of ''bargraph'', ''linegraph'', or ''pointgraph''. 
For example, one might have a line like: + +<sub>((resource-bargraph`:`resource_name.csv##B1#B4#C1#C4|Quadratic Function)) </sub> + +which could produce a chart like + +((resource-bargraph:##(1,1)#(2,4)#(3,9)#(4,16)|Quadratic Function)) + +In the above example, the values for the `x` coordinates would come from the cells B1, B2, B3, B4 from +''resource_name.csv '' and the values for the `y` coordinates would come from cells C1, C2, C3, C4 from +''resource_name.csv ''. Alternatively, rather than use a CSV to get out data we can just list the points we want to plot with a command like: + +<sub>((resource-bargraph`:`##(1,1)#(2,4)#(3,9)#(4,16)|Quadratic Function))</sub> + +==Manipulating Page Resources== + +: A list of media that have already been associated with +a page appears under the Page Resource heading below the textarea. This +table allows the user to rename and delete resources as well as insert the +same resource at multiple locations within the same document. + +: The resources section of the edit page can be thought of as similar to +a folder in Windows or MacOS. One can have subfolders of the resource folder. + +: The '''Places''' dropdown at the top of the '''Page Resource''' section allows one to navigate +these folders. + +: The '''Filter''' textfield lets you enter a search string. +Clicking '''Go''' then shows only those resources +which contain that search string in their title. + +: The '''Clip Folder''' dropdown is used to copy files between folders and pages. +Its current value is the folder that the '''Clip Copy''' buttons next to resources +will copy their resource to when clicked. You can set the '''Clip Folder''' to +the current folder using the dropdown, then move to the page and folder that +you would like to copy stuff from and click the '''Clip Copy''' button of the +desired resource. + +: The '''Name''', '''Size''', '''Modified''' header links above the resources list +control the sort order for the resource list. 
If a page is a media list page, +then even in read mode, the sort order selected is remembered when drawing the +media list. + +: The '''Actions''' dropdown can be used to create new folders, new text files, and new csv +text files within the current page resource folder. These are initially named beginning +with ''untitled'' followed by some number, and if applicable a file extension. + +: Resource entries for the resources list consist first of an icon, followed by a textfield +with a name for the resource, followed by buttons for actions that can be done to that resource +(Rename, Add to Page, Clip Copy), followed by a link [X], which can be used to delete the resource. +If a resource is editable the icon will look like a plus sign together with a pencil. Clicking +on the icon will then let you edit the resource. + +===Text and CSV Resources=== +: For normal text files clicking edit will bring up a textarea with the contents of the text to edit. +For CSV (comma separated value) files this will present the file as an editable spreadsheet. +Yioop spreadsheets can have equations much like Excel spreadsheets. Clicking on a cell lets one +edit its contents. For example, if in the cell A3 +one entered the equation: + = A1+A2 +then clicking out of the cell would cause it to refresh with the value of the sum of the contents of +cells A1 and A2. In addition to the standard arithmetic operators ['*', '/', '+', '-', '%'], the +spreadsheet expressions can use float or integer literals, and can make use of the following table +of built-in functions: + +{| class="wikitable" +!Function Name!!Description +|- +!avg(x1,...,xn), avg(x1:xn)|| computes average of values of cells listed as arguments +|- +!ceil(x)|| rounds the value of x up to nearest integer +|- +!cell(i,j)|| returns the contents of the cell with column name of letter j, and row name i. For example, cell(2,'B') would return the contents of cell B2. 
+|- +!col(value, search_row, start_col, end_col)|| searches the row ''search_row'' between the columns +''start_col'', ''end_col'' for ''value''. Returns the column name where this value was found or -1 if not found. +For example, col(3, 2, "B", "D") might return C if the cell C2 had value 3. +|- +!exp(x)|| computes `e^x` +|- +!floor(x)|| rounds the value of x down to the nearest integer +|- +!log(x)|| computes `log x` +|- +!min(x1,...,xn), min(x1:xn)|| computes minimum value of cells listed as arguments +|- +!max(x1,...,xn), max(x1:xn)|| computes maximum value of cells listed as arguments +|- +!pow(x,y)|| computes `x^y` +|- +!row(value, search_col, start_row, end_row)|| searches the column ''search_col'' +between the rows ''start_row'', ''end_row'' for ''value''. +Returns the row name where this value was found or -1 if not found. +For example, row(3, "C", "1", "5") might return 2 if the cell C2 had value 3. +|- +!sqrt(x)|| computes `sqrt(x)` +|- +!sum(x1,...,xn), sum(x1:xn)|| computes sum of values of cells listed as arguments +|- +!username()|| returns username of the person using this CSV file +|} + +===HTML, PDF and EPub Resources=== +: How HTML, PDF, EPub resources included on a page render depends on how the Yioop wiki software +has been configured. If no special configuration has been done, then HTML and PDF documents +will be rendered in an <iframe> tag within the current wiki page. In the EPub case, a link +to download the resource will be given. If the wiki software detects the presence of the +file APP_DIR/scripts/pdf.js ([[https://en.wikipedia.org/wiki/PDF.js|PDF.js]]) +or APP_DIR/scripts/epub.js ([[https://github.com/futurepress/epub.js|epub.js]]), the wiki +system will render the resource in a Javascript viewer and will do things like remember reading +position. + + +===Video and Audio Resources=== + +: Not all browsers support the same video and audio formats for playback. 
For this reason +it sometimes is useful to have multiple video resources for the same video. For example, +you might have a .ogv and .vp8 version of the same video recording. In read (non-edit) +mode, the Yioop wiki system displays only one link for video or audio files that have +the same name except for extension. It then includes the grouped files as separate <source> +tags within either the <video> or <audio> html tag used to render the item in the browser. +In this way, you can make your media take best advantage of whatever capabilities your +client's browser has. If you don't feel like recoding your media in such a fancy way, a safe +rule of thumb is that .mp3 audio will playback in all modern browsers, and that .mp4 video +will playback in all modern browsers. + +: For video it is sometimes useful to add a subtitle or caption track. Yioop wiki supports +[[https://en.wikipedia.org/wiki/WebVTT|WebVTT]] format subtitles and captions. To see how +Yioop wiki makes use of these files, suppose you included a resource ''foo.mp4'' in your +wiki pages, and you also had a file named ''foo-captions-en-US.vtt'' then when the HTML +page is generated from your wiki page, a <track> tag for the caption file would be added +to the <video> tag. A user seeing this page would then see in the video player a closed caption +symbol and be able to turn on/off (defaults off) the English captions. If you +named the file ''foo-subtitles-en-US.vtt'' instead, then Yioop wiki would include it as a +subtitles track (defaults on). You can add captions/subtitle files for as many languages as +desired. + +: When viewing the page resources for a page in edit mode, one can see one file/resource and +no grouping of resources by name is done. In this way you can keep track of exactly what +resources are available for a page. + +==Page Settings, Page Type== + +: In edit mode for a wiki page, next to the page name, is a link [Settings]. 
+Clicking this link expands a form which can be used to control global settings +for a wiki page. This form contains a drop down for the page type, another +drop down for the type of border for the page in non-logged in mode, +a checkbox for whether a table of contents should be auto-generated from level 2 +and level 3 headings and then text +fields or areas for the page title, author, meta robots, and page description. +Beneath this one can specify another wiki page to be used as a header for this +page and also specify another wiki page to be used as a footer for this page. + +: The contents of the page title is displayed in the browser title when the +wiki page is accessed with the Activity Panel collapsed or when not logged in. +Similarly, in the collapsed or not logged in mode, if one looks at the HTML +page source for the page, in the head of document, <meta> tags for author, +robots, and description are set according to these fields. These fields can +be useful for search engine optimization. The robots meta tag can be +used to control how search engine robots index the page. Wikipedia has more information on +[[https://en.wikipedia.org/wiki/Meta_element|Meta Elements]]. + +: The '''Standard''' page type treats the page as a usual wiki page. + +: '''Page Alias''' type redirects the current page to another page name. This can +be used to handle things like different names for the same topic or to do localization +of pages. For example, if you switch the locale from English to French and +you were on the wiki page dental_floss when you switch to French the article +dental_floss might redirect to the page dentifrice. + +: '''Media List''' type means that the page, when read, should display just the +resources in the page as a list of thumbnails and links. These links for the +resources go to separate pages used to display these resources. +This kind of page is useful for a gallery of +images or a collection of audio or video files. 
+ +: '''Presentation''' type is for a wiki page whose purpose is a slide presentation. In this mode, +.... +on a line by itself is used to separate one slide. If presentation type is selected, a new +slide icon appears in the wiki edit bar allowing one to easily add new slides. +When the Activity panel is not collapsed and you are reading a presentation, it just +displays as a single page with all slides visible. Collapsing the Activity panel presents +the slides as a typical slide presentation using the [[www.w3.org/Talks/Tools/Slidy2/Overview.html|Slidy]] javascript. EOD; $public_pages["en-US"]["ad_program_terms"] = <<< 'EOD' @@ -2366,19 +2366,23 @@ robots= description= +alternative_path= + page_header= page_footer= -END_HEAD_VARSThese checkboxes control whether various links and drop downs on the search result and landing -pages appear or not. - -; '''Word Suggest''': Controls whether the suggested query drop down appear as a query is entered in the search bar and whether thesaurus results appear on search result pages. -; '''Subsearch''' : Controls whether the links to subsearches such as Image, Video, and News search appear at the top of all search pages -; '''Signin''' : Controls whether the '''Sign In''' link appears at the top of the Yioop landing and search result pages. -; '''Cache''', '''Similar''', '''Inlinks''', '''IP Address''': Control whether the corresponding links appear after each search result item. - +sort=aname +END_HEAD_VARSThese checkboxes control whether various links and drop downs on the search result and landing +pages appear or not. + +; '''Word Suggest''': Controls whether the suggested query drop down appears as a query is entered in the search bar. +; '''Subsearch''' : Controls whether the links to subsearches such as Image, Video, and News search appear at the top of all search pages +; '''Signin''' : Controls whether the '''Sign In''' link appears at the top of the Yioop landing and search result pages. 
+; '''Cache''', '''Similar''', '''Inlinks''', '''IP Address''': Control whether the corresponding links appear after each search result item. + + EOD; $help_pages["en-US"]["Seed_Sites_and_URL_Suggestions"] = <<< EOD diff --git a/src/controllers/SearchController.php b/src/controllers/SearchController.php index febb9ab14..f5ee96c9e 100755 --- a/src/controllers/SearchController.php +++ b/src/controllers/SearchController.php @@ -965,12 +965,12 @@ class SearchController extends Controller implements CrawlConstants $out_pages[$first_image]['IMAGES'] = []; } $out_pages[$first_image]['IMAGES'][] = $page; - } else if (!empty($page[self::IS_NEWS])) { + } else if (!empty($page[self::IS_FEED])) { if ($first_feed_item == -1) { $first_feed_item = count($out_pages); - $out_pages[$first_feed_item]['NEWS'] = []; + $out_pages[$first_feed_item]['FEED'] = []; } - $out_pages[$first_feed_item]['NEWS'][] = $page; + $out_pages[$first_feed_item]['FEED'][] = $page; } else { $out_pages[] = $page; } diff --git a/src/controllers/components/StoreComponent.php b/src/controllers/components/StoreComponent.php index 54f47d743..856f595a4 100644 --- a/src/controllers/components/StoreComponent.php +++ b/src/controllers/components/StoreComponent.php @@ -441,7 +441,7 @@ class StoreComponent extends Component $parent = $this->parent; $keywords = explode("," , strtoupper($data['KEYWORDS'])); array_walk($keywords, [C\NS_COMPONENTS . 
- "AdvertisementComponent", "trim_value"]); + "StoreComponent", "trim_value"]); $min_bid_reqd = 0; $expensive_bid = 0; foreach ($keywords as $keyword) { diff --git a/src/css/search.css b/src/css/search.css index d2a8a41c0..e2c83a4db 100755 --- a/src/css/search.css +++ b/src/css/search.css @@ -1285,20 +1285,6 @@ ul.in-list li top: -0.8in; width: 8in; } -.html-ltr .thesaurus-serp-results -{ - left: 2.2in; - position: relative; - top: -1.7in; - width: 8in; -} -.html-rtl .thesaurus-serp-results -{ - right: 2.2in; - position: relative; - top: -1.7in; - width: 8in; -} .html-rtl .serp { position: relative; @@ -1353,27 +1339,6 @@ ul.in-list li top:7px; width:100px; } - -.html-ltr .thesaurus -{ - color: #666; - font-size: 14pt; - font-weight: bold; - left: 0.2in; - position: relative; - top: -0.8in; - width: 8in; -} -.html-rtl .thesaurus -{ - color: #666; - font-size: 14pt; - font-weight: bold; - right: 0.2in; - position: relative; - top: -0.8in; - width: 8in; -} .result { clear: both; diff --git a/src/executables/ArcTool.php b/src/executables/ArcTool.php index f098830af..f498bddde 100755 --- a/src/executables/ArcTool.php +++ b/src/executables/ArcTool.php @@ -256,49 +256,66 @@ class ArcTool implements CrawlConstants echo "\nBundle Name: $bundle_name\n"; $archive_type = $this->getArchiveKind($archive_path); echo "Bundle Type: $archive_type\n"; - if (strcmp($archive_type,"IndexArchiveBundle") != 0) { $this->badFormatMessageAndExit($archive_path, "index"); } $index_timestamp = substr($archive_path, strpos($archive_path, self::index_data_base_name) + strlen(self::index_data_base_name)); - $mask = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"; - $hash_key = L\crawlHashWord($word, true, $mask) ; - $start_time = microtime(true); - $info = IndexManager::getWordInfo($index_timestamp, $hash_key, 0, - $mask, -1, $start_generation, $num_generations); - echo "Dictionary Lookup Time:" . 
L\changeInMicrotime($start_time)."\n"; - if (!$info) { + $hash_paths = L\allCrawlHashPaths($word, true); + $found = false; + echo "!!Performing Looking up for phrase " . + "at each possible shift position. Outputting results for each ". + "possibility!!\n"; + foreach ($hash_paths as $hash_shift) { + if (is_array($hash_shift)) { + list($hash_key, $shift) = $hash_shift; + } else { + $hash_key = $hash_shift; + $shift = 0; + } + $start_time = microtime(true); + echo "Looking up in dictionary:\n"; + echo " Key: ". L\toHexString($hash_key) . "\n"; + echo " Shift: ". $shift . "\n"; + $info = IndexManager::getWordInfo($index_timestamp, $hash_key, + $shift, -1, $start_generation, $num_generations); + echo "Dictionary Lookup Time:" . L\changeInMicrotime($start_time) + . "\n"; + if (!$info) { + echo " Key not found\n"; + continue; + } + $found = true; + echo "Dictionary Tiers: "; + $index = IndexManager::getIndex($index_timestamp); + $tiers = $index->dictionary->active_tiers; + foreach ($tiers as $tier) { + echo " $tier"; + } + echo "\nBundle Dictionary Entries for '$word':\n"; + echo "====================================\n"; + $i = 1; + foreach ($info as $record) { + echo "RECORD: $i\n"; + echo "Hex ID: " . 
L\toHexString($record[4])."\n"; + echo "GENERATION: {$record[0]}\n"; + echo "FIRST WORD OFFSET: {$record[1]}\n"; + echo "LAST WORD OFFSET: {$record[2]}\n"; + echo "NUMBER OF POSTINGS: {$record[3]}\n\n"; + $i++; + } + } + if (!$found) { //fallback to old word hashes $info = IndexManager::getWordInfo($index_timestamp, - L\crawlHash($word, true), 0, "", 1, $start_generation, + L\crawlHash($word, true), 0, 1, $start_generation, $num_generations); if (!$info) { - echo "\n$word does not appear in bundle!\n\n"; + echo "\n$word does not appear in bundle!\n"; exit(); } } - echo "Dictionary Tiers: "; - $index = IndexManager::getIndex($index_timestamp); - $tiers = $index->dictionary->active_tiers; - foreach ($tiers as $tier) { - echo " $tier"; - } - echo "\nBundle Dictionary Entries for '$word':\n"; - echo "====================================\n"; - $i = 1; - foreach ($info as $record) { - echo "RECORD: $i\n"; - echo "Hex ID: ".L\toHexString($record[4])."\n"; - echo "Media Type: " . PhraseParser::getMediaType($record[4]) . "\n"; - echo "Safe: ". PhraseParser::getSafety($record[4]) . "\n"; - echo "GENERATION: {$record[0]}\n"; - echo "FIRST WORD OFFSET: {$record[1]}\n"; - echo "LAST WORD OFFSET: {$record[2]}\n"; - echo "NUMBER OF POSTINGS: {$record[3]}\n\n"; - $i++; - } } /** * Prints information about the number of words and frequencies of words @@ -568,11 +585,12 @@ class ArcTool implements CrawlConstants $max_generation = max($max_generation, $generation); } for ($i = $start_shard; $i < $max_generation + 1; $i++) { - $shard_name = $path."/posting_doc_shards/index$i"; + $shard_name = $path . 
"/posting_doc_shards/index$i"; echo "\nShard $i of $num_shards\n"; $shard = new IndexShard($shard_name, $i, C\NUM_DOCS_PER_GENERATION, true); if ($dictionary->addShardDictionary($shard)) { + $shard->saveWithoutDictionary(); file_put_contents($shard_count_file, $i + 1); } else { echo "Problem adding shard $i"; @@ -929,7 +947,7 @@ class ArcTool implements CrawlConstants $triplet_lists = PhraseParser::extractPhrasesInLists($phrase_string, $lang); - $word_lists = $triplet_list['WORD_LIST']; + $word_lists = $triplet_lists['WORD_LIST']; $len = strlen($phrase_string); if (PhraseParser::computeSafeSearchScore($word_lists, $len) < 0.012) { @@ -952,8 +970,7 @@ class ArcTool implements CrawlConstants $doc_keys .= $score_keys; } $shard->addDocumentWords($doc_keys, $offset, - $word_lists, $meta_ids, - PhraseParser::$materialized_metas, true, false); + $word_lists, $meta_ids, true, false); $offset = $object[0]; } $seen_partition += $num_to_get; @@ -963,7 +980,6 @@ class ArcTool implements CrawlConstants } $this->reindexIndexArchive($archive_path); } - /** * Used to create an archive_bundle_iterator for a non-yioop archive * As these iterators sometimes make use of a folder to store savepoints diff --git a/src/executables/Fetcher.php b/src/executables/Fetcher.php index 3d92faa31..031baece0 100755 --- a/src/executables/Fetcher.php +++ b/src/executables/Fetcher.php @@ -2865,8 +2865,7 @@ class Fetcher implements CrawlConstants } $this->found_sites[self::INVERTED_INDEX][$this->current_server ]->addDocumentWords($doc_keys, self::NEEDS_OFFSET_FLAG, - $word_lists, $meta_ids, PhraseParser::$materialized_metas, - true, $doc_rank); + $word_lists, $meta_ids, true, $doc_rank); if (isset($word_and_qa_lists['QUESTION_ANSWER_LIST'])) { $site[self::QUESTION_ANSWERS] = $word_and_qa_lists['QUESTION_ANSWER_LIST']; @@ -2882,6 +2881,9 @@ class Fetcher implements CrawlConstants */ if (!$this->no_process_links && !isset($site[self::JUST_METAS]) && !isset($this->programming_language_extension[$lang])) { 
+ $tokenizer = PhraseParser::getTokenizer($lang); + $has_stopwords_remover = + method_exists($tokenizer, "stopwordsRemover"); foreach ($site[self::LINKS] as $url => $link_text) { /* this mysterious check means won't index links from robots.txt. Sitemap will still be in TO_CRAWL, but that's @@ -2900,17 +2902,25 @@ class Fetcher implements CrawlConstants } $elink_flag = ($link_host != $host) ? true : false; $link_text = strip_tags($link_text); + if ($has_stopwords_remover) { + $useful_text = $tokenizer->stopwordsRemover($link_text); + } else { + $useful_text = $link_text; + } + if (mb_strlen($useful_text) < C\MIN_LINKS_TEXT_DOC) { + continue; + } $ref = ($elink_flag) ? "eref" : "iref"; $url = str_replace('|', "%7C", $url); $link_id = - "url|".$url."|text|".urlencode($link_text). - "|$ref|".$site_url; + "url|" . $url . "|text|" . urlencode($link_text) . + "|$ref|" . $site_url; $elink_flag_string = ($elink_flag) ? "e" : "i"; $link_keys = L\crawlHash($url, true) . L\crawlHash($link_id, true) . $elink_flag_string. - substr(L\crawlHash($host."/", true), 1); + substr(L\crawlHash($host . "/", true), 1); $summary[self::URL] = $link_id; $summary[self::TITLE] = $url; // stripping html to be on the safe side @@ -2937,9 +2947,7 @@ class Fetcher implements CrawlConstants $this->found_sites[self::INVERTED_INDEX][ $part_num]->addDocumentWords($link_keys, self::NEEDS_OFFSET_FLAG, $link_word_lists, - $link_meta_ids, - PhraseParser::$materialized_metas, false, - $link_rank); + $link_meta_ids, false, $link_rank); } } $interim_elapse = L\changeInMicrotime($interim_time); diff --git a/src/executables/QueryTool.php b/src/executables/QueryTool.php index 0cf09c645..bbc9a0847 100755 --- a/src/executables/QueryTool.php +++ b/src/executables/QueryTool.php @@ -78,7 +78,8 @@ class QueryTool implements CrawlConstants $this->usageMessageAndExit(); } $query = $argv[1]; - $results_per_page = (isset($argv[2])) ? $argv[2] : 10; + $results_per_page = (isset($argv[2])) ? (is_numeric($argv[2]) ? 
+ $argv[2] : 0 ) : 10; $limit = (isset($argv[3])) ? $argv[3] : 0; L\setLocaleObject((isset($argv[4])) ? $argv[4] : C\DEFAULT_LOCALE); $start_time = microtime(true); diff --git a/src/executables/QueueServer.php b/src/executables/QueueServer.php index 692b671cd..5adc90db4 100755 --- a/src/executables/QueueServer.php +++ b/src/executables/QueueServer.php @@ -344,7 +344,7 @@ class QueueServer implements CrawlConstants, Join in_array($argv[3], [self::INDEXER, self::SCHEDULER])) { $this->server_type = $argv[3]; $this->server_name = $argv[3]; - L\crawlLog($argv[3]." logging started."); + L\crawlLog($argv[3] . " logging started."); } $remove = false; $old_message_names = ["QueueServerMessages.txt", @@ -524,9 +524,8 @@ class QueueServer implements CrawlConstants, Join $crawl_params[self::CRAWL_TIME] = $this->crawl_time; $crawl_params[self::CRAWL_TYPE] = $this->crawl_type; $info_string = serialize($crawl_params); - file_put_contents( - C\CRAWL_DIR."/schedules/". $this->process_name . "Messages.txt", - $info_string); + file_put_contents(C\CRAWL_DIR . "/schedules/" . + $this->process_name . "Messages.txt", $info_string); chmod(C\CRAWL_DIR."/schedules/". $this->process_name . "Messages.txt", 0777); } @@ -872,7 +871,6 @@ class QueueServer implements CrawlConstants, Join } $close_file = C\CRAWL_DIR.'/schedules/'.self::index_closed_name. $this->crawl_time.".txt"; - if (!file_exists($close_file) && strcmp($this->server_type, self::BOTH) != 0) { file_put_contents($close_file, "2"); @@ -905,10 +903,9 @@ class QueueServer implements CrawlConstants, Join $crawl_status['CRAWL_TIME'] = $this->crawl_time; $crawl_status['COUNT'] = 0; $crawl_status['DESCRIPTION'] = $message; - file_put_contents( - C\CRAWL_DIR."/schedules/crawl_status.txt", + file_put_contents(C\CRAWL_DIR . "/schedules/crawl_status.txt", serialize($crawl_status)); - chmod(C\CRAWL_DIR."/schedules/crawl_status.txt", 0777); + chmod(C\CRAWL_DIR . 
"/schedules/crawl_status.txt", 0777); } /** * When a crawl is being shutdown, this function is called to write @@ -931,7 +928,7 @@ class QueueServer implements CrawlConstants, Join return; } L\crawlLog("Writing queue contents back to schedules..."); - $dir = C\CRAWL_DIR."/schedules/".self::schedule_data_base_name. + $dir = C\CRAWL_DIR."/schedules/" . self::schedule_data_base_name . $this->crawl_time; if (!file_exists($dir)) { mkdir($dir); @@ -989,8 +986,9 @@ class QueueServer implements CrawlConstants, Join $data_string = L\webencode( gzcompress(serialize($schedule_data))); $data_hash = L\crawlHash($data_string); - file_put_contents($dir."/At".$schedule_time."From127-0-0-1". - $note_string. "WithHash$data_hash.txt", $data_string); + file_put_contents($dir."/At" . $schedule_time . + "From127-0-0-1". $note_string . + "WithHash$data_hash.txt", $data_string); $data_string = ""; $schedule_data[self::TO_CRAWL] = []; } @@ -1007,7 +1005,7 @@ class QueueServer implements CrawlConstants, Join } else { $schedule_time = $time; } - file_put_contents($dir."/At".$schedule_time."From127-0-0-1". + file_put_contents($dir."/At" . $schedule_time . "From127-0-0-1". $note_string . 
"WithHash$data_hash.txt", $data_string); } $this->db->setWorldPermissionsRecursive( @@ -1026,7 +1024,7 @@ class QueueServer implements CrawlConstants, Join $this-> index_archive->forceSave(); $this-> - index_archive->addCurrentShardDictionary(); + index_archive->addAdvanceGeneration(); $this->index_archive->dictionary->mergeAllTiers(); } $this->db->setWorldPermissionsRecursive( diff --git a/src/library/CrawlConstants.php b/src/library/CrawlConstants.php index 7a632d132..176fe2242 100755 --- a/src/library/CrawlConstants.php +++ b/src/library/CrawlConstants.php @@ -231,8 +231,7 @@ interface CrawlConstants const CENTROID_WEIGHTED_SUMMARIZER = 'dt'; const SCRAPER_LABEL = 'du'; const SCRAPERS = 'dv'; - const IS_NEWS = "dw"; - const QUESTION_ANSWERS = 'dx'; - const CONTENT_SIZE = 'dy'; - const NO_RANGE = 'dz'; + const QUESTION_ANSWERS = 'dw'; + const CONTENT_SIZE = 'dx'; + const NO_RANGE = 'dy'; } diff --git a/src/library/IndexArchiveBundle.php b/src/library/IndexArchiveBundle.php index d2343cce0..eba02a888 100644 --- a/src/library/IndexArchiveBundle.php +++ b/src/library/IndexArchiveBundle.php @@ -252,6 +252,8 @@ class IndexArchiveBundle implements CrawlConstants public function addAdvanceGeneration($callback = null) { $this->addCurrentShardDictionary($callback); + echo "Resaving active shard without prefix and dictionary etc\n"; + $this->getActiveShard()->saveWithoutDictionary(); //Set up new shard $this->generation_info['ACTIVE']++; $this->generation_info['CURRENT'] = @@ -320,8 +322,7 @@ class IndexArchiveBundle implements CrawlConstants $current_index_shard_file = $this->dir_name . "/posting_doc_shards/index". 
$this->generation_info['CURRENT']; if (file_exists($current_index_shard_file)) { - if (isset($this->generation_info['DISK_BASED']) && - $this->generation_info['DISK_BASED'] == true) { + if (!empty($this->generation_info['DISK_BASED'])) { $this->current_shard = new IndexShard( $current_index_shard_file, $this->generation_info['CURRENT'], diff --git a/src/library/IndexDictionary.php b/src/library/IndexDictionary.php index af324a20b..2dd219f0e 100644 --- a/src/library/IndexDictionary.php +++ b/src/library/IndexDictionary.php @@ -250,7 +250,7 @@ class IndexDictionary implements CrawlConstants } } // write prefixes - $fh = fopen($this->dir_name."/$i/0".$out_slot.".dic", "wb"); + $fh = fopen($this->dir_name . "/$i/0" . $out_slot . ".dic", "wb"); fwrite($fh, substr($prefix_string, $i * $prefix_header_size, $prefix_header_size)); $j = $num_prefix_letters; @@ -301,7 +301,7 @@ class IndexDictionary implements CrawlConstants for ($i = 0; $i < self::NUM_PREFIX_LETTERS; $i++) { crawlTimeoutLog("..processing first index prefix $i of ". self::NUM_PREFIX_LETTERS." in $tier."); - $this-> mergeTierFiles($i, $tier, $out_slot); + $this->mergeTierFiles($i, $tier, $out_slot); } } /** @@ -322,7 +322,7 @@ class IndexDictionary implements CrawlConstants $prefix_header_size = self::PREFIX_HEADER_SIZE; $fh_a = fopen( $file_a, "rb"); $fh_b = fopen( $file_b, "rb"); - $fh_out = fopen( $this->dir_name."/$prefix/".($tier + 1). + $fh_out = fopen( $this->dir_name . "/$prefix/" . ($tier + 1) . "$out_slot.dic", "wb+"); $prefix_bit = ($prefix & 128) ? 0 : 128; // Scan past prefix headers @@ -530,8 +530,8 @@ class IndexDictionary implements CrawlConstants * @param string $record_b a dictionary record including auxiliary records * from the 'b'th file of the give tier * @param int $prefix_bit either 0 or 32768. The first bit of an auxiliary - * record should be ~higher order bit of the given prefix letter - * used by the tier file. 
+ * record should be negation of higher order bit of the given prefix + * letter used by the tier file. * @return string a single record with merged strings making use of * auxliary records as needed containing * (generation, posting list offset, length) information. @@ -563,7 +563,7 @@ class IndexDictionary implements CrawlConstants $aux_record_len); if (count($aux_records) == 3) { $record .= chr($prefix_bit + ($num_aux_records >> 8)) . - chr($num_aux_records & 255). implode("", $aux_records); + chr($num_aux_records & 255) . implode("", $aux_records); $aux_records = []; $num_aux_records++; } @@ -609,7 +609,7 @@ class IndexDictionary implements CrawlConstants $posting_info = str_split(substr($record_string, $offset + 2, 30), 10); if (!isset($posting_info[2]) ){ crawlLog("Decode Aux Record failed...". - toHexString($record_string)." ".$offset); + toHexString($record_string)." " . $offset); crawlLog(print_r($posting_info, true)); crawlLog(print_r(debug_backtrace(), true)); exit(); @@ -716,9 +716,6 @@ class IndexDictionary implements CrawlConstants * @param bool $raw whether the id is our version of base64 encoded or not * @param int $shift how many low order bits to drop from $word_id's * when checking for a match - * @param string $mask bit mask to be applied to bytes after the 8th - * byte through 20th byte of word_id. 
In single word case these - * bytes contain safe:, media:, and class: meta word info * @param int $threshold if greater than zero how many posting list * results in dictionary info returned before stopping looking for * more matches @@ -728,7 +725,7 @@ class IndexDictionary implements CrawlConstants * @return mixed an array of entries of the form * generation, first offset, last offset, count */ - public function getWordInfo($word_id, $raw = false, $shift = 0, $mask = "", + public function getWordInfo($word_id, $raw = false, $shift = 0, $threshold = -1, $start_generation = -1, $num_distinct_generations = -1, $with_remaining_total = false) { @@ -738,8 +735,7 @@ class IndexDictionary implements CrawlConstants $current_max_generation = -2; foreach ($this->active_tiers as $tier) { $tier_info = $this->getWordInfoTier($word_id, $raw, $tier, $shift, - $mask, $threshold, $start_generation, - $num_distinct_generations); + $threshold, $start_generation, $num_distinct_generations); if (is_array($tier_info) && isset($tier_info[2]) && is_array($tier_info[2])) { list($found_count, $max_found_generation, @@ -790,9 +786,6 @@ class IndexDictionary implements CrawlConstants * @param int $tier which tier to get word info from * @param int $shift how many low order bits to drop from $word_id's * when checking for a match - * @param string $mask bit mask to be applied to bytes after the 8th - * byte through 20th byte of word_id. 
In single word case these - * bytes contain safe:, media:, and class: meta word info * @param int $threshold if greater than zero how many posting list * results in dictionary info returned before stopping looking for * more matches @@ -807,8 +800,7 @@ class IndexDictionary implements CrawlConstants * no data */ public function getWordInfoTier($word_id, $raw, $tier, $shift = 0, - $mask = "", $threshold = -1, $start_generation = -1, - $num_distinct_generations = -1) + $threshold = -1, $start_generation = -1, $num_distinct_generations = -1) { $num_generations = 0; $max_retained_generation = -1; @@ -827,17 +819,12 @@ class IndexDictionary implements CrawlConstants if (strlen($word_id) < 1) { return false; } - if ($mask != "") { - $mask_len = min(11, strlen($mask)); - } else { - $mask_len = 0; - } $word_item_len = $word_key_len + IndexShard::WORD_DATA_LEN; $word_data_len = IndexShard::WORD_DATA_LEN; $file_num = ord($word_id[0]); /* Entries for a particular shard have postings for both - docs and links. If an entry has more than max_entry_len + docs and links. If an entry has more than max_entry_count we will assume entry somehow got corrupted and skip that generation for that word. Because we are including link have set threshold to 5 * number of docs that could be in a shard @@ -912,7 +899,7 @@ class IndexDictionary implements CrawlConstants $id_info = []; $num_aux_records = (ord($word_string[$word_key_len]) << 8) + ord($word_string[$word_key_len + 1]); - $word_string = "\x00\x00".substr($word_string, $word_key_len + 2); + $word_string = "\x00\x00" . 
substr($word_string, $word_key_len + 2); $tmp = IndexShard::getWordInfoFromString($word_string, true); $check_and_auxes = 1; if ($tmp[3] < $max_entry_count) { @@ -920,10 +907,11 @@ class IndexDictionary implements CrawlConstants $previous_id = $id; $remember_generation = $previous_generation; if ($start_generation <= $previous_generation) { - if ($this->checkMaskAndAdd($id, $word_id, $mask, $mask_len, + $this->addLookedUpEntry($id, $word_id, $tmp, $info, $total_count, $previous_generation, $previous_id, $num_generations, $num_distinct_generations, - $max_retained_generation, $id_info) && $num_aux_records>0) { + $max_retained_generation, $id_info); + if ($num_aux_records > 0) { $this->addAuxInfoRecords($id ,$file_num, $num_aux_records, $total_count, $threshold, $info, $previous_generation, $num_generations, $start + @@ -946,7 +934,7 @@ class IndexDictionary implements CrawlConstants single records get corrupted. */ $break_count = 0; - /* we found one match so far (ignoring mask), we are now backing up + /* we found one match so far, we are now backing up to look for earlier matches */ while ($test_loc >= $low) { @@ -997,11 +985,12 @@ class IndexDictionary implements CrawlConstants $num_generations < $num_distinct_generations || $current_generation <= $max_retained_generation )) { - if ($this->checkMaskAndAdd($id, $word_id, $mask, $mask_len, + $this->addLookedUpEntry($id, $word_id, $tmp, $info, $total_count, $previous_generation, $previous_id, $num_generations, $num_distinct_generations, $max_retained_generation, - $id_info) && $num_aux_records > 0) { + $id_info); + if ($num_aux_records > 0) { $this->addAuxInfoRecords($id, $file_num, $num_aux_records, $total_count, $threshold, $info, $previous_generation, $num_generations, $start + @@ -1020,7 +1009,7 @@ class IndexDictionary implements CrawlConstants $test_loc = $check_loc + $check_and_auxes; $previous_generation = $remember_generation; $break_count = 0; - /* from the first match we found (ignoring mask), we are now 
looking + /* from the first match we found, we are now looking forward to find matches */ while ($test_loc <= $high) { @@ -1050,11 +1039,11 @@ class IndexDictionary implements CrawlConstants $num_generations < $num_distinct_generations || $current_generation <= $max_retained_generation )) { - if ($this->checkMaskAndAdd($id, $word_id, $mask, $mask_len, - $tmp, $info, $total_count, $previous_generation, - $previous_id, $num_generations, - $num_distinct_generations, $max_retained_generation, - $id_info) && $num_aux_records > 0) { + $this->addLookedUpEntry($id, $word_id, $tmp, $info, + $total_count, $previous_generation, $previous_id, + $num_generations, $num_distinct_generations, + $max_retained_generation, $id_info); + if ($num_aux_records > 0) { $this->addAuxInfoRecords($id, $file_num, $num_aux_records, $total_count, $threshold, $info, $previous_generation, $num_generations, $start + @@ -1078,7 +1067,8 @@ class IndexDictionary implements CrawlConstants * a given word id can't be stored in a single record * * @param string $id word id to add aux records for - * @param int $file_num + * @param int $file_num which prefix file to read from (always reads + * a file at the max_tier level) * @param int $num_aux_records * @param int& $total_count * @param int $threshold @@ -1129,7 +1119,9 @@ class IndexDictionary implements CrawlConstants $id_info[$record[0]][] = count($info); $info[] = $record; $total_count += $record[3]; - if ($threshold > 0 && $total_count > $threshold) { return; } + if ($threshold > 0 && $total_count > $threshold) { + return; + } $previous_generation = $record[0]; } } @@ -1155,18 +1147,14 @@ class IndexDictionary implements CrawlConstants /** * This method is used when computing the array of * (generation, posting_list_start, len, exact_word_id) quadruples when - * looking up a $word_id in an index dictionary. It checks - * if the $id of a dictionary row matches $word_id up to the $mask info. 
- * If so, it adds the word record to the quadruple array $info that has been + * looking up a $word_id in an index dictionary. It adds the + * word record to the quadruple array $info that has been * calculated so far. It also update $total_count, and as well as * $previous info for the previous matching record. * * @param string $id of a row to compare $word_id against * @param string $word_id the word id of a term or phrase we are computing * the quadruple array for - * @param string $mask up to 9 byte wask used to say which materialized - * meta words should be checked for when doing a match - * @param int $mask_len this should be strlen($mask) * @param array $record current record from dictionary that we may or may * not add to info * @param array& $info quadruple array we are adding to @@ -1177,61 +1165,40 @@ class IndexDictionary implements CrawlConstants * @param int $num_distinct_generations * @param int& $max_retained_generation * @param array& $id_info - * @return bool whether the record was added */ - public function checkMaskAndAdd($id, $word_id, $mask, $mask_len, $record, + public function addLookedUpEntry($id, $word_id, $record, &$info, &$total_count, &$previous_generation, &$previous_id, &$num_generations, $num_distinct_generations, &$max_retained_generation, &$id_info) { $record[4] = $id; - $add_flag = true; - if ($mask != "" && strlen($id) > 9 && strlen($word_id) > 9 && - substr_compare($id, $word_id, 9, $mask_len) != 0) { - $k = 0; - $old_k = 0; - while(($k = strpos($mask, "\xFF", $old_k)) !== false) { - $loc = $k + 8; - if (isset($id[$loc]) && $id[$loc] != $word_id[$loc]) { - $add_flag = false; - break; - } - if ($k == $old_k) { - $k++; - } - $old_k = $k; - } - } - if ($add_flag) { //adding to the end is front is slower than tacking to end - if ($num_distinct_generations > 0) { - if (!isset($id_info[$record[0]])) { - $id_info[$record[0]] = []; - if ($num_generations >= $num_distinct_generations) { - if (isset($id_info[$max_retained_generation])) 
{ - foreach ($id_info[$max_retained_generation] as - $key) { - $total_count -= $info[$key][3]; - $info[$key] = false; - } - unset($id_info[$max_retained_generation]); - } - $max_retained_generation = max(array_keys($id_info)); - } else { - $num_generations++; - if ($record[0] > $max_retained_generation) { - $max_retained_generation = $record[0]; + if ($num_distinct_generations > 0) { + if (!isset($id_info[$record[0]])) { + $id_info[$record[0]] = []; + if ($num_generations >= $num_distinct_generations) { + if (isset($id_info[$max_retained_generation])) { + foreach ($id_info[$max_retained_generation] as + $key) { + $total_count -= $info[$key][3]; + $info[$key] = false; } + unset($id_info[$max_retained_generation]); + } + $max_retained_generation = max(array_keys($id_info)); + } else { + $num_generations++; + if ($record[0] > $max_retained_generation) { + $max_retained_generation = $record[0]; } } - $id_info[$record[0]][] = count($info); } - $info[] = $record; - $total_count += $record[3]; - $previous_generation = $record[0]; - $previous_id = $id; } - return $add_flag; + $id_info[$record[0]][] = count($info); + $info[] = $record; + $total_count += $record[3]; + $previous_generation = $record[0]; + $previous_id = $id; } /** * Gets from disk $len many bytes beginning at $offset from the @@ -1302,4 +1269,4 @@ class IndexDictionary implements CrawlConstants $this->fhs[$file_num][$tier], self::DICT_BLOCK_SIZE); return $this->blocks[$file_num][$tier][$bytes]; } -} \ No newline at end of file +} diff --git a/src/library/IndexManager.php b/src/library/IndexManager.php index af6d718ae..83b710377 100644 --- a/src/library/IndexManager.php +++ b/src/library/IndexManager.php @@ -149,8 +149,6 @@ class IndexManager implements CrawlConstants * dictionary * @param int $shift if $hash is for a phrase, how many low order * bits of word id to discard - * @param string $mask if $hash is for a word, after the 9th byte what - * meta word mask should be applied to the 20 byte hash * @param 
int $threshold after the number of results exceeds this amount * stop looking for more dictionary entries. * @param int $start_generation @@ -161,28 +159,18 @@ class IndexManager implements CrawlConstants * that match $hash) */ public static function getWordInfo($index_name, $hash, $shift = 0, - $mask = "", $threshold = -1, $start_generation = -1, - $num_distinct_generations = -1, $with_remaining_total = false) + $threshold = -1, $start_generation = -1, $num_distinct_generations = -1, + $with_remaining_total = false) { $id = "$index_name:$start_generation:$num_distinct_generations"; $index = self::getIndex($index_name); - $len = strlen($mask); - if ($len > 0) { - $pre_hash = substr($hash, 0, 8) . - "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"; - } else { - $pre_hash = $hash; - } $tmp = []; - $test_mask = ""; if ((!C\nsdefined('NO_FEEDS') || !C\NO_FEEDS) && $start_generation < 0 && file_exists(C\WORK_DIRECTORY . "/feeds/index")) { - //NO_FEEDS defined true in statistic_controller.php $use_feeds = true; $feed_shard = self::getIndex("feed"); - $feed_info = $feed_shard->getWordInfo($hash, true, $shift, - $mask); + $feed_info = $feed_shard->getWordInfo($hash, true, $shift); if (is_array($feed_info)) { $tmp[-1] = [-1, $feed_info[0], $feed_info[1], $feed_info[2], $feed_info[3]]; @@ -191,7 +179,7 @@ class IndexManager implements CrawlConstants if (!empty($index->dictionary)) { $pre_info = $index->dictionary->getWordInfo($hash, true, $shift, - $mask, $threshold, $start_generation, + $threshold, $start_generation, $num_distinct_generations, true); } if (!empty($pre_info[1])) { @@ -230,22 +218,12 @@ class IndexManager implements CrawlConstants } $pos = -1; $total_num_docs = 0; - $hashes = allCrawlHashPaths($term_or_phrase, [], [], true); - if (!is_array($hashes)) { - $hashes = [$hashes]; - } + $hashes = allCrawlHashPaths($term_or_phrase, true); foreach ($hashes as $hash) { - if (is_array($hash)) { - list($num_docs, ) = - self::getWordInfo($index_name, $hash[0], - 
$hash[1], $hash[2], $threshold, $start_generation, - $num_distinct_generations, true); - } else { - list($num_docs, ) = - self::getWordInfo($index_name, $hash, 0, "", - $threshold, $start_generation, $num_distinct_generations, - true); - } + list($num_docs, ) = + self::getWordInfo($index_name, $hash[0], + $hash[1], $threshold, $start_generation, + $num_distinct_generations, true); $total_num_docs += $num_docs; if ($threshold > 0 && $total_num_docs > $threshold) { return $total_num_docs; diff --git a/src/library/IndexShard.php b/src/library/IndexShard.php index 0b24c8df7..4dcedaa7b 100644 --- a/src/library/IndexShard.php +++ b/src/library/IndexShard.php @@ -323,7 +323,6 @@ class IndexShard extends PersistentStructure implements * @param array $word_lists (word => array of word positions in doc) * @param array $meta_ids meta words to be associated with the document * an example meta word would be filetype:pdf for a PDF document. - * @param array $materialized_metas * @param bool $is_doc flag used to indicate if what is being sored is * a document or a link to a document * @param mixed $rank either false if not used, or a 4 bit estimate of the @@ -331,8 +330,7 @@ class IndexShard extends PersistentStructure implements * @return bool success or failure of performing the add */ public function addDocumentWords($doc_keys, $summary_offset, $word_lists, - $meta_ids = [], $materialized_metas = [], $is_doc = false, - $rank = false) + $meta_ids = [], $is_doc = false, $rank = false) { if ($this->word_docs_packed == true) { $this->words = []; @@ -343,7 +341,9 @@ class IndexShard extends PersistentStructure implements $link_doc_len = 0; $len_key = strlen($doc_keys); $num_keys = floor($len_key/self::DOC_KEY_LEN); - if ($num_keys * self::DOC_KEY_LEN != $len_key) { return false; } + if ($num_keys * self::DOC_KEY_LEN != $len_key) { + return false; + } if ($num_keys % 2 == 0 ) { $doc_keys .= self::BLANK; //want to keep docids_len divisible by 16 } @@ -358,17 +358,16 @@ class 
IndexShard extends PersistentStructure implements foreach ($meta_ids as $meta_id) { $word_lists[$meta_id] = []; } - $meta_string = encodeMaterialMetas($meta_ids, $materialized_metas); //using $this->docids_len divisible by 16 $doc_offset = $this->docids_len >> 4; foreach ($word_lists as $word => $position_list) { $occurrences = count($position_list); if (isset($position_list["cond_max"])) { //for now - $word_id = crawlHashPath($word, - $position_list["cond_max"], [], [], true); + $word_id = crawlHashPath($word, $position_list["cond_max"], + true); unset($position_list["cond_max"]); } else { - $word_id = crawlHashWord($word, true, $meta_string); + $word_id = crawlHashWord($word, true); } $store = packPosting($doc_offset, $position_list); if (!isset($this->words[$word_id])) { @@ -417,12 +416,10 @@ class IndexShard extends PersistentStructure implements * @param bool $raw whether the id is our version of base64 encoded or not * @param int $shift how many low order bits to drop from $word_id's * when checking for a match - * @param string $mask if $hash is for a word, after the 9th byte what - * meta word mask should be applied to the 20 byte hash * @return array first offset, last offset, count, exact matching id ( * recall match can ignore low order shift bits) */ - public function getWordInfo($word_id, $raw = false, $shift = 0, $mask = "") + public function getWordInfo($word_id, $raw = false, $shift = 0) { if ($raw == false) { //get rid of out modified base64 encoding @@ -431,7 +428,6 @@ class IndexShard extends PersistentStructure implements $is_disk = $this->read_only_from_disk; $word_item_len = self::WORD_KEY_LEN + self::WORD_DATA_LEN; $word_key_len = self::WORD_KEY_LEN; - $mask_len = strlen($mask); if ($is_disk) { $this->getShardHeader(); if (!isset($word_id[1])) { @@ -465,48 +461,6 @@ class IndexShard extends PersistentStructure implements $id = substr($word_string, 0, $word_key_len); $cmp = compareWordHashes($word_id, $id, $shift); if ($cmp === 0) { - $found = 
false; - $orig_id = $id; - $old_check_loc = $check_loc; - while (compareWordHashes($word_id, $id, $shift) == 0 && - $check_loc >= $low) { - if ($check_loc != $old_check_loc) { - $word_string = $this->getWordString($is_disk, $start, - $check_loc, $word_item_len); - if ($word_string == false) { - break; - } - $id = substr($word_string, 0, $word_key_len); - } - if (matchingWordMetas($word_id, $id, $mask, $mask_len)) { - $found = true; - break; - } - $check_loc--; - } - $check_loc = $old_check_loc; - $id = $orig_id; - if (!$found) { - while (compareWordHashes($word_id, $id, $shift) == 0 && - $check_loc <= $high) { - if ($check_loc != $old_check_loc) { - $word_string = $this->getWordString($is_disk, - $start, $check_loc, $word_item_len); - if ($word_string == false) { - break; - } - $id = substr($word_string, 0, $word_key_len); - } - if (matchingWordMetas($word_id, $id, $mask,$mask_len)) { - $found = true; - break; - } - $check_loc++; - } - } - if (!$found) { - return false; - } $tmp_info = $this->getWordInfoFromString( substr($word_string, $word_key_len)); $tmp_info[] = $id; @@ -668,7 +622,8 @@ class IndexShard extends PersistentStructure implements } $item[self::DOC_LEN] = $doc_len; $item[self::IS_DOC] = $is_doc; - $item[self::PROXIMITY]=$this->computeProximity($position_list, $is_doc); + $item[self::PROXIMITY] = + $this->computeProximity($position_list, $is_doc); $occurrences = $this->weightedCount($position_list, $is_doc); //override $occurrences if $occurs != 0 if ($occurs != 0) { @@ -776,6 +731,9 @@ class IndexShard extends PersistentStructure implements self::TITLE => 0, self::DESCRIPTION => 0, self::LINKS => 0]; + if (!is_array($position_list)) { + return $count; + } foreach ($position_list as $position) { if ($is_doc) { if ($position < C\AD_HOC_TITLE_LENGTH) { @@ -1289,10 +1247,10 @@ class IndexShard extends PersistentStructure implements crawlLog("Saving index shard .. 
done merge postings to string"); } $this->prepareWordsAndPrefixes($with_logging); - if ($with_logging) { - crawlLog("Saving index shard .. make prefixes"); - } - $header = pack("N*", $this->prefixes_len , + if ($with_logging) { + crawlLog("Saving index shard .. make prefixes"); + } + $header = pack("N*", $this->prefixes_len, $this->words_len, $this->word_docs_len, $this->docids_len, @@ -1337,6 +1295,31 @@ class IndexShard extends PersistentStructure implements $this->word_docs_packed = false; return $out; } + /** + * This method re-saves a saved shard without the prefixes and dictionary. + * It would typically be called after this information has been stored + * in an IndexDictionary obbject so that the data is not redundantly stored + */ + public function saveWithoutDictionary() + { + $this->getShardHeader(); + $header = pack("N*", 0, 0, + $this->word_docs_len, + $this->docids_len, + $this->generation, + $this->num_docs_per_generation, + $this->num_docs, + $this->num_link_docs, + $this->len_all_docs, + $this->len_all_link_docs); + $word_docs = $this->getWordDocsSubstring(); + $doc_infos = $this->getDocInfoSubstring(); + $fh = fopen($this->filename, "wb"); + fwrite($fh, $header); + fwrite($fh, $word_docs); + fwrite($fh, $doc_infos); + fclose($fh); + } /** * Computes the prefix string index for the current words array. * This index gives offsets of the first occurrences of the lead two char's @@ -1394,7 +1377,7 @@ class IndexShard extends PersistentStructure implements /** * Posting lists are initially stored associated with a word as a key * value pair. The merge operation then merges them these to a string - * help by word_postings. packWords separates words from postings. + * by word_postings. packWords separates words from postings. * After being applied words is a string consisting of * triples (as concatenated strings) word_id, start_offset, end_offset. 
* The offsets refer to integers offsets into a string $this->word_docs @@ -1504,7 +1487,6 @@ class IndexShard extends PersistentStructure implements $postings = substr($this->word_postings, $pos + $key_len + $posting_len, $len); $pos += $key_len + $posting_len + $len; - if ($len != $two_doc_len || strncmp($postings, self::HALF_BLANK, self::POSTING_LEN) != 0) { if ($fh != null) { @@ -1582,8 +1564,11 @@ class IndexShard extends PersistentStructure implements * @param $len number of bytes to get * @return desired string */ - public function getWordDocsSubstring($offset, $len) + public function getWordDocsSubstring($offset = 0, $len = 0) { + if ($len <= 0) { + $len = $this->word_docs_len; + } if ($this->read_only_from_disk) { return $this->getShardSubstring($this->word_doc_offset + $offset, $len); @@ -1611,8 +1596,11 @@ class IndexShard extends PersistentStructure implements * @param $len number of bytes to get * @return desired string */ - public function getDocInfoSubstring($offset, $len) + public function getDocInfoSubstring($offset = 0, $len = 0) { + if ($len <= 0) { + $len = $this->docids_len; + } if ($this->read_only_from_disk) { return $this->getShardSubstring( $this->doc_info_offset + $offset, $len, false); @@ -1870,4 +1858,4 @@ class IndexShard extends PersistentStructure implements substr($value, self::WORD_KEY_LEN, self::WORD_DATA_LEN); } -} \ No newline at end of file +} diff --git a/src/library/PhraseParser.php b/src/library/PhraseParser.php index 46efbdaa8..14eceb76a 100755 --- a/src/library/PhraseParser.php +++ b/src/library/PhraseParser.php @@ -60,11 +60,6 @@ class PhraseParser 'path:', 'robot:', 'safe:', 'server:', 'site:', 'size:', 'time:', 'u:', 'version:','weight:', 'w:' ]; - /** - * Those meta words whose values will be encoded as part of word_ids - * @var array - */ - public static $materialized_metas = ["class:", "media:", "safe:"]; /** * A list of meta words that might be extracted from a query * @var array @@ -1076,57 +1071,6 @@ class 
PhraseParser $link_meta_ids[] = "link:all"; return $link_meta_ids; } - /** - * Given the word key of a term (a hash of the term string which may - * have materialized meta information such as media type encoded in it), - * compute the media type. - * @param string $word_key hash of term with encoded metas - * @return string what media type it is such as Text, Image, News, Video - * if it can be determined and unknown otherwise. - */ - public static function getMediaType($word_key) - { - if (strlen($word_key) < 10) { - return "unknown"; - } - $media_char = $word_key[9]; - $media_types = ["media:text" => "Text", "media:image" => "Image", - "media:video" => "Video", "media:news" => "News"]; - foreach ($media_types as $type => $common_name) { - $material_meta_string = encodeMaterialMetas([$type], - PhraseParser::$materialized_metas); - if ($material_meta_string[0] == $media_char) { - return $common_name; - } - } - return "Unknown"; - } - /** - * Given the word key of a term (a hash of the term string which may - * have materialized meta information such as safe (not X-rated) search - * info encoded in it), compute the safe value. 
- * @param string $word_key hash of term with encoded metas - * @return string whether the term is associated with a "safe" page - * in which case the string "True" is returned; an "unsafe" page - * in which case the string "False" is returned; or "Undefined" - * if it cannot be determined from the word key - */ - public static function getSafety($word_key) - { - if (strlen($word_key) < 11) { - return "unknown"; - } - $safety_char = $word_key[10]; - $safety_types = ["safe:true" => "True", "safe:false" => "False"]; - foreach ($safety_types as $type => $common_name) { - $material_meta_string = encodeMaterialMetas([$type], - PhraseParser::$materialized_metas); - if ($material_meta_string[1] == $safety_char) { - return $common_name; - } - } - return "Undefined"; - } /** * Computes the Cosine-similarity of two phrases * diff --git a/src/library/Thesaurus.php b/src/library/Thesaurus.php deleted file mode 100644 index 77dd37858..000000000 --- a/src/library/Thesaurus.php +++ /dev/null @@ -1,361 +0,0 @@ -<?php -/** - * SeekQuarry/Yioop -- - * Open Source Pure PHP Search Engine, Crawler, and Indexer - * - * Copyright (C) 2009 - 2018 Chris Pollett chris@pollett.org - * - * LICENSE: - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <https://www.gnu.org/licenses/>. 
- * - * END LICENSE - * - * @author Shailesh Padave shaileshpadave49@gmail.com - * @license https://www.gnu.org/licenses/ GPL3 - * @link https://www.seekquarry.com/ - * @copyright 2009 - 2018 - * @filesource - */ -namespace seekquarry\yioop\library; - -use seekquarry\yioop\configs as C; - -/** For Yioop global defines */ -require_once __DIR__."/../configs/Config.php"; -/** - * Class used to reorder the last 10 links computed by PhraseModel based on - * thesaurus semantic information. For English, thesaurus semantic information - * can be provided by WordNet, a lexical English database - * available at http://wordnet.princeton.edu/ - * To enable, you this have to define WORDNET_EXEC in your local_config file. - * The idea behind thresaurus reordering is that given a query, it - * is tagged for parts of speech. Each term is then looked up in thesaurus for - * those parts of speech. Representative phrases for those term senses are - * extracted from the ranked thesaurus output and a set of rewrites of the - * original query are created. By looking up the number - * of times these rewrites occur in the searched index the top two phrases - * that represent the original query are computed.The BM25 similarity of these - * phrases is then scored against each of the 10 output summaries of - * PhraseModel and used to reorder the results. 
- * To add thesaurus reordering for a different locale, two methods need to be - * written in that locale tokenizer.php file - * tagPartsOfSpeechPhrase($phrase) which on an input phrase return a string - * where each term_i in the phrase has been replace with term_i~pos - * where pos is a two character part of speech NN, VB, AJ, AV, or NA (if - * none of the previous apply) - * scoredThesaurusMatches($term, $word_type, $whole_query) which takes - * a term from an original whole_query which has been tagged to be - * one of the types VB (for verb), NN (for noun), AJ (for adjective), - * AV (for adverb), or NA (for anything else), it outputs - * a sequence of (score => array of thesaurus terms) associations. - * The score representing one word sense of term - * Given that these methods have been implemented if the use_thesaurus field - * of that language tokenizer is set to true, the thesaurus will be used. - */ -class Thesaurus -{ - /** - * Extracts similar phrases to the input query using thesaurus results. - * Part of speech tagging is processed on input and the output is - * looked up in the thesaurus. USing this a ranked list of alternate - * query phrases is created. - * For those phrases, counts in the Yioop index are calculated - * and the top two phrases are selected. 
- * @param string $orig_query input query from user - * @param string $index_name selected index for search engine - * @param string $lang locale tag for the query - * @param integer $threshold once count in posting list for any word - * reaches to threshold then return the number - * @return array of top two words - */ - public static function getSimilarPhrases($orig_query, $index_name, - $lang, $threshold = 10) - { - $num_docs = []; - $scores = []; - - $suggested_queries = - self::getInitialSuggestions($orig_query, $lang); - foreach ($suggested_queries as $suggestion) { - $num_docs[$suggestion] = - self::numDocsIndex($suggestion, $threshold, $index_name, $lang); - } - arsort($num_docs); - $result = []; - $i = 0; - foreach ($num_docs as $k => $v) { - $result[$i] = $k; - $i++; - if ($i >= 2) { break; } - } - return $result; - } - /** - * Gets array of BM25 scores for given input array of summaries - * and thesaurus generated queries - * @param array $similar_phrases an array of thesaurus generated queries - * @param array $summaries an array of summaries which is generated - * during crawl time. 
- * @return array of BM25 score for each document based on the thesaurus - * simimar phrases - */ - public static function scorePhrasesSummaries($similar_phrases, $summaries) - { - $score = []; - //if there are no similar words then - if (empty($similar_phrases)) { - return []; - } else { - $num_phrases = count($similar_phrases); - for ($i = 0; $i < $num_phrases; $i++) { - $phrase = $similar_phrases[$i]; - $terms = explode(' ', $phrase); - $summaries = self::changeCaseOfStringArray($summaries); - $idf = self::calculateIDF($summaries, $terms); - $tf = self::calculateTFBM25($summaries, $terms); - $num_summaries = count($summaries); - $num_terms = count($terms); - $bm25_result[$i] = - self::calculateBM25($idf, $tf, $num_terms, $num_summaries); - } - if (count($bm25_result) == 1) { - for ($i = 0; $i < $num_summaries; $i++) { - $temp = 0; - $temp = $bm25_result[0][$i]; - $score[$i] = $temp; - } - } else { - for ($i = 0; $i < $num_summaries; $i++) { - $temp = 0; - $temp = $bm25_result[0][$i] * (2/3) + - $bm25_result[1][$i] * (1/3); - $score[$i] = $temp; - } - } - return $score; - } - } - /** - * Computes suggested related phrases from thesaurus based on part of - * speech done on each query term. 
- * - * @param string $query query entered by user - * @param string $lang locale tag for the query - * @return string array $suggestion consisting of phrases suggested to - * be similar in meaning to some sens of the query - */ - public static function getInitialSuggestions($query, $lang) - { - $tokenizer = PhraseParser::getTokenizer($lang); - $pos_query = $tokenizer->tagPartsOfSpeechPhrase($query); - $max_len = 25; - $replacement_phrases = []; - $suggestions = []; - $terms = preg_split("/\s+|\-/", trim($query)); - $pos_terms = preg_split("/\s+/", - trim($pos_query), -1, PREG_SPLIT_NO_EMPTY); - $num_pos_terms = count($pos_terms); - $word_type = null; - $similar_words = []; - $known_word_types = ["NN", "VB", "AJ", "AV"]; - for ($i = 0; $i < $num_pos_terms; $i++) { - $pos = strpos($pos_terms[$i], '~'); - $word_type = trim(substr($pos_terms[$i], $pos + 1)); - if (!in_array($word_type, $known_word_types)) { - $word_type = "NA"; - } - $current_word = substr($pos_terms[$i], 0, $pos); - if ($word_type != "NA") { - $similar_phrases = $tokenizer->scoredThesaurusMatches( - $current_word, $word_type, $query); - $highest_scoring_sense_phrases = ($similar_phrases) ? - array_shift($similar_phrases): false; - if ($highest_scoring_sense_phrases) { - $replacement_phrases[$current_word] = - $highest_scoring_sense_phrases; - } - } - } - $i = 0; - foreach ($replacement_phrases as $words => $similar_phrases) { - foreach ($similar_phrases as $phrase) { - if (mb_strpos(trim($phrase), ' ') !== false) { - $phrase = preg_replace('/~[\w]+/', '', $phrase); - } - $modified_query = preg_replace( - '/' . $words . '/', trim($phrase), $query); - if (mb_strlen($modified_query) < $max_len && - mb_strpos($modified_query, $query) === false) { - $suggestions[$i] = $modified_query; - $i++; - } - } - } - return $suggestions; - } - /** - * Returns the number of documents in an index that a phrase occurs in. - * If it occurs in more than threshold documents then cut off search. 
- * - * @param string $phrase to look up in index - * @param int $threshold once count in posting list for any word - * reaches to threshold then return the number - * @param string $index_name selected index for search engine - * @param string $lang locale tag for the query - * @return int number of documents phrase occurs in - */ - public static function numDocsIndex($phrase, $threshold, $index_name, $lang) - { - PhraseParser::canonicalizePunctuatedTerms($phrase, $lang); - $terms = PhraseParser::stemCharGramSegment($phrase, $lang); - $num = count($terms); - if ($index_name == null) { - return 0; - } - if (count($terms) > C\MAX_QUERY_TERMS) { - $terms = array_slice($terms, 0, C\MAX_QUERY_TERMS); - } - $whole_phrase = implode(" ", $terms); - return IndexManager::numDocsTerm($whole_phrase, $index_name, - $threshold); - } - /** - * Lower cases an array of strings - * - * @param array $summaries strings to put into lower case - * @return array with strings converted to lower case - */ - public static function changeCaseOfStringArray($summaries) - { - return explode("-!-", mb_strtolower(implode("-!-", $summaries))); - } - /** - * Computes the BM25 of an array of documents given that the idf and - * tf scores for these documents have already been computed - * - * @param array $idf inverse doc frequency for given query array - * @param array $tf term frequency for given query array - * @param $num_terms number of terms that make up input query - * @param $num_summaries count for input summaries - * @returns array consisting of BM25 scores for each document - */ - public static function calculateBM25($idf, $tf, $num_terms, $num_summaries) - { - $scores = []; - for ($i = 0; $i < $num_terms; $i++) { - for ($j = 0; $j < $num_summaries; $j++) { - $bm25_score[$i][$j] = $idf[$i] * $tf[$i][$j]; - } - } - for ($i = 0; $i < $num_summaries; $i++) { - $val = 0; - for ($j = 0; $j < $num_terms; $j++) { - $val += $bm25_score[$j][$i]; - } - $scores[$i] = $val; - } - return $scores; - } 
- /** - * Calculates the BM25 normalized term frequency of a set of terms in - * a collection of text summaries - * - * @param array $summaries list of summary strings to compute BM25TF w.r.t - * @param array $terms we want the term frequency computation for - * @return array $tfbm25 a 2d array with rows being indexed by terms and - * columns indexed by summaries and the values of an entry being - * the tfbm25 score for that term in that document - */ - public static function calculateTFBM25($summaries, $terms) - { - $k1 = 1.5; - $b = 0.75; - $tf_values = []; - $tfbm25 = []; - $doc_length = strlen(implode("", $summaries)); - $num_summaries = count($summaries); - if ($num_summaries!= 0) { - $avg_length = $doc_length / $num_summaries; - } else { - $avg_length = 0; - } - $avg_length = max($avg_length, 1); - $tf_values = self::calculateTermFreq($summaries, $terms); - $num_terms =count($terms); - for ($i = 0; $i < $num_terms; $i++) { - for ($j = 0; $j < $num_summaries; $j++) { - $frequency = $tf_values[$i][$j]; - $tfbm25[$i][$j] = - ($frequency * ($k1 + 1))/($frequency + $k1 * - ((1 - $b) + $b * ($doc_length/$avg_length))); - } - } - return $tfbm25; - } - /** - * Computes a 2D array of the number of occurences of term i in document j - * - * @param array $summaries documents to compute frequencies in - * @param array $terms terms to compute frequencies for - * @return array 2D array as described above - */ - public static function calculateTermFreq($summaries, $terms) - { - $tf_values = []; - $num_terms = count($terms); - $num_summaries = count($summaries); - for ($i = 0; $i < $num_terms; $i++) { - for ($j = 0; $j < $num_summaries; $j++) { - if ($terms[$i] != "") { - $frequency = substr_count($summaries[$j], $terms[$i]); - $tf_values[$i][$j] = $frequency; - } else { - $tf_values[$i][$j] = 0; - } - } - } - return $tf_values; - } - /** - * To get the inverse document frequencies for a collection of terms in - * a set of documents. 
- * IDF(term_i) = log_10(# of document / # docs term i in) - * - * @param array $summaries documents to use in calculating IDF score - * @param array $terms terms to compute IDF score for - * @return array $idf 1D-array saying the inverse document frequency for - * each term - */ - public static function calculateIDF($summaries, $terms) - { - $N = count($summaries); - $Nt = []; - $term_count = 0; - $num_terms = count($terms); - for ($i = 0; $i < $num_terms; $i++) { - $cnt_Nt = 0; - $term_count++; - foreach ($summaries as $summary) - { - if (stripos($summary, $terms[$i]) !== false) { - $cnt_Nt++; - } - } - $Nt[$i] = $cnt_Nt; - $idf[$i] = ($Nt[$i] != 0) ? log10($N / $Nt[$i]) : 0; - } - return $idf; - } -} diff --git a/src/library/Utility.php b/src/library/Utility.php index aadb01782..2d7fe1807 100755 --- a/src/library/Utility.php +++ b/src/library/Utility.php @@ -36,7 +36,7 @@ namespace seekquarry\yioop\library; use seekquarry\yioop\configs as C; /** For Yioop global defines */ -require_once __DIR__."/../configs/Config.php"; +require_once __DIR__ . "/../configs/Config.php"; /** * Adds delimiters to a regex that may or may not have them * @@ -50,8 +50,8 @@ function addRegexDelimiters($expression) $last = $expression[$len - 1]; if (($first != $last && $len > 1) || $len == 1) { $expression = ($first != '/' ) ? - "/".$expression."/" - : "@".$expression."@"; + "/" . $expression . "/" + : "@" . $expression . 
"@"; } return $expression; } @@ -233,7 +233,6 @@ function vByteDecode(&$str, &$offset) $pos_int += (ord($str[$offset] & 127) << $shift); $shift += 7; } - return $pos_int; } /** @@ -285,7 +284,7 @@ function packPosting($doc_index, $position_list, $delta = true) */ function unpackPosting($posting, &$offset, $dedelta = true) { - $delta_list = decodeModified9($posting, $offset); + $delta_list = (array) decodeModified9($posting, $offset); $doc_index = array_shift($delta_list); if (($doc_index & (2 << 26)) > 0) { $delta0 = ($doc_index & ((2 << 9) - 1)); @@ -946,17 +945,14 @@ function crawlHash($string, $raw = false) * * @param string $string word to hash * @param bool $raw whether to base64Hash the result - * @param $meta_string the up to 11 byte string of meta information * @return string first 8 bytes of md5 of $string concatenated with \x00 * to indicate the hash is of a word not a phrase concatenated with the * padded to 11 byte $meta_string. */ -function crawlHashWord($string, $raw = false, $meta_string = "") +function crawlHashWord($string, $raw = false) { $pre_hash = substr(md5($string, true), 0, 8) . - "\x00"; - $meta_string = substr($meta_string, 0, 11); - $pre_hash .= $meta_string; + "\x00" . substr($string, 0, 11); $pre_hash = str_pad($pre_hash, 20, "\x00"); /* low order bytes all 0 -- distinguishes it from a crawlHashPath */ if (!$raw) { @@ -973,24 +969,15 @@ function crawlHashWord($string, $raw = false, $meta_string = "") * maximal. 
* * @param string $string what to find hashes for - * @param array $metas array of meta word values - * @param array $encode_metas a list of meta word names to encode in word_ids * @param bool $raw whether to base64 the result * @return array of hashes with appropriates shifts if needed */ -function allCrawlHashPaths($string, $metas = [], $encode_metas = [], - $raw = false) +function allCrawlHashPaths($string, $raw = false) { - $mask = ""; - if ($encode_metas != []) { - $mask_num = min(11, count($encode_metas)); - $found_materialized_metas = findMaterialMetas($metas, $encode_metas); - foreach ($encode_metas as $meta) { - $mask .= (isset($found_materialized_metas[$meta])) ? "\xFF": "\x00"; - } - } $pos = -1; $hashes = []; + $last_entry = null; + $new_entry = null; $zero = "*"; $shift = 0; $num_spaces = substr_count($string, " "); @@ -1000,8 +987,7 @@ function allCrawlHashPaths($string, $metas = [], $encode_metas = [], $old_pos = $pos; $path_string = $string; for ($i = 0; $i < $num; $i++) { - $hash = crawlHashPath($path_string, $pos + 1, $metas, - $encode_metas, $raw); + $hash = crawlHashPath($path_string, $pos + 1, $raw); if ($i > 0 && $j > 0) { $path_len = $num_spaces - $j + 1 + $i; if ($path_len < 4) { @@ -1075,97 +1061,23 @@ function allCrawlHashPaths($string, $metas = [], $encode_metas = [], $shift = 64 + 29 * ($i - 12); } } - $hashes[] = [$hash, $shift, $mask]; - } else if ($mask != "") { - $hashes[] = [$hash, $shift, $mask]; + $new_entry = [$hash, $shift]; } else { - $hashes[] = $hash; + $new_entry = [$hash, 0]; + } + if ($new_entry != $last_entry) { + $hashes[] = $new_entry; + } + if ($j == 0) { + break; } - if ($j == 0) {break; } $path_string .= " " . 
$zero; } $pos = mb_strpos($string, " ", $pos + 1); - $encode_metas = []; $j++; } while($pos > 0 && $old_pos != $pos); - if (count($hashes) == 1) { - return $hashes[0]; - } return $hashes; } -/** - * Give an array of values for meta words (for example, media:video, lang:en) - * and an array of names of meta words to be encoded into word_id's - * (for example, media:, safe:, class:) return an associative array of pairs - * (meta word name =>array(value of that name)) which should be encoded - * into word id's - * - * @param array $metas array of meta word values - * @param array $encode_metas a list of meta word names to encode in word_ids - * @return array $found_materialized_metas associative array of name => - * values for that name - */ -function findMaterialMetas($metas, $encode_metas) -{ - $found_materialized_metas = []; - foreach ($metas as $meta_id) { - if ($encode_metas != []) { - $match_kinds = explode(":", $meta_id); - $next_char = (isset($match_kinds[1][0])) ? $match_kinds[1][0] : - ord('a'); - $is_class = ($match_kinds[0] == 'class'); - if (count($match_kinds) > 1 && - in_array($match_kinds[0].":", $encode_metas) && - !in_array($match_kinds[1], ["all"]) && - !isset($match_kinds[2])) { - $found_materialized_metas[$match_kinds[0].":"][] = - $meta_id; - } - } - } - return $found_materialized_metas; -} -/** - * Give an array of values for meta words (for example, media:video) - * and an array of names of meta words to be encoded into word_id's - * (for example, media:, safe:, class:) returns a string mask for the - * byte positions in a word_id after the 9th byte. The format of a word id - * in the case of a single word is described in the documentation for - * @see crawlHashPath - * - * @param array $metas a list of meta word values extracted from a query - * string or document. 
- * @param array $encode_metas a list of meta word names that should be encoded - * in word id's For example, (media:, safe:, class:) - * @return string a 9 byte string where encoded meta word values have been - * stored - */ -function encodeMaterialMetas($metas, $encode_metas) -{ - if (!is_array($encode_metas) || empty($encode_metas)) { - return ""; - } - $found_materialized_metas = findMaterialMetas($metas, $encode_metas); - $meta_string = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"; - foreach ($found_materialized_metas as $name => $values) { - foreach ($values as $value) { - if ($name == 'class:' && isset($value[6])) { - $pre_meta_pos = ord($value[6]); - /* - positions for classifier classes start at 2 - */ - $meta_pos = (($pre_meta_pos) % 9) + 2; - } else { - /* m is first char of media, s is first char of s - offset will be 1 if safe, 0 if media - */ - $meta_pos = (ord($name[0]) > ord('m')) ? 1 : 0; - } - $meta_string[$meta_pos] = substr(crawlHash($value, true), 0, 1); - } - } - return $meta_string; -} /** * Given a string makes an 20 byte hash path - where first 8 bytes is * a hash of the string before path start, last 12 bytes is the path @@ -1185,36 +1097,22 @@ function encodeMaterialMetas($metas, $encode_metas) * If $path_start is 0 behaves like crawlHashWord(). The above encoding is * typically used to make word_ids for whole phrases, to make word id's * for single words, the format is - * (64 bits for word, 1 byte null, remaining 11 bytes encode an materialized - * meta words present in document or query string). Of this 11 bytes, - * the first is used for the meta word media:, so if the document is of type - * media:image, then a single byte hash of media:image gives the value of this - * byte. The second byte encodes the meta word safe: in a similar fashion. - * The remaining 9 bytes encode different values of the class: meta word. - * To encode class:some_value., first class:some_value[0] is hashed to a value - * j betwen 0 and 8. 
Then class:some_value is hash to a single byte b. Then - * the jth value of the remaining bytes is set to b. Non affected bytes are - * null. + * (64 bits for word, 1 byte null, then ignored 11 bytes ). * * @param string $string what to hash * @param int $path_start what to use as the split between 5 byte front * hash and the rest - * @param array $metas meta word values from a document or query string - * @param array $encode_metas a list of names of meta word values which should - * encoded into word ids. i.e., (media:, safe:, class:) or none. * @param bool $raw whether to modified base64 the result * @return string 8 bytes that results from this hash process */ -function crawlHashPath($string, $path_start = 0, $metas = [], - $encode_metas = [], $raw = false) +function crawlHashPath($string, $path_start = 0, $raw = false) { if ($path_start > 0 ) { $string_parts = explode(" ", substr($string, $path_start)); $num_parts = count($string_parts); } if ($path_start == 0 || $num_parts == 0) { - $meta_string = encodeMaterialMetas($metas, $encode_metas); - $hash = crawlHashWord($string, true, $meta_string); + $hash = crawlHashWord($string, true); if (!$raw) { $hash = base64Hash($hash); } @@ -1227,7 +1125,6 @@ function crawlHashPath($string, $path_start = 0, $metas = [], $path_ints = []; $modes = [3, 3, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13]; $mode_nums = [1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]; - foreach ($string_parts as $part) { if ($part == "*") { $path_ints[] = 0; @@ -1236,7 +1133,9 @@ function crawlHashPath($string, $path_start = 0, $metas = [], } } $num_parts = count($path_ints); - if ($num_parts > 13) {$num_parts = 13; } + if ($num_parts > 13) { + $num_parts = 13; + } $mode = $modes[$num_parts]; $mode_num = $mode_nums[$num_parts]; switch ($mode) { @@ -1268,7 +1167,6 @@ function crawlHashPath($string, $path_start = 0, $metas = [], + ($path_ints[4] & $mask)) << $shift) + ($path_ints[5] & $mask)) << $shift) + ($path_ints[6] & $mask); - break; case 9: $path_ints[8] 
= isset($path_ints[8]) ? $path_ints[8] : 0; @@ -1366,11 +1264,7 @@ function crawlHashPath($string, $path_start = 0, $metas = [], */ function compareWordHashes($id1, $id2, $shift = 0) { - if (!isset($id1[8]) || !isset($id2[8])) { - return strncmp($id1, $id2, 8); - } else if ($id1[8] == "\x00") { - return strncmp($id1, $id2, 9); - } else if ($shift < 32) { + if ($shift < 32) { $cmp = strncmp($id1, $id2, 16); } else if ($shift < 64) { $cmp = strncmp($id1, $id2, 12); @@ -1393,35 +1287,6 @@ function compareWordHashes($id1, $id2, $shift = 0) $id2 = packInt(unpackInt(substr($id2, $pos, 4)) >> $shift); return strcmp($id1, $id2); } -/** - * Check if two word id's match according to a mask of the last 12 bytes. - * - * @param string $word_id 20 byte word id to compare - * @param string $id 20 byte word id to compare - * @param string $mask what mask to use - * @param string $mask_len the length of the mask - * @return bool true if match; false otherwise - */ -function matchingWordMetas($word_id, $id, $mask = "", $mask_len = 0) -{ - if ($mask != "" && strlen($id) > 9 && strlen($word_id) > 9 && - substr_compare($id, $word_id, 9, $mask_len) != 0) { - $k = 0; - $old_k = 0; - while(($k = strpos($mask, "\xFF", $old_k)) !== false) { - $loc = $k + 8; - if (isset($id[$loc]) && $id[$loc] != $word_id[$loc]) { - return false; - break; - } - if ($k == $old_k) { - $k++; - } - $old_k = $k; - } - } - return true; -} /** * Converts a crawl hash number to something closer to base64 coded but * so doesn't get confused in urls or DBs @@ -1983,28 +1848,6 @@ function generalIsA($class_1, $class_2) } return (is_a($class_1, $class_2) || is_subclass_of($class_1, $class_2)); } -/** - * Given an array of arrays acting much like a database table, this - * returns a sequence of key value pairs, where the keys are the distinct - * entries in $key_column and the values are the counts of numbers in - * $count_column for each particular key; - * - * @param array $arr an array of arrays - * @param mixed 
$key_column (string or int) field name of key column - * @param mixed $count_column (string or int) field name of count column - * @return array key => values pairs of counts - */ -function arrayColumnCount($arr, $key_column, $count_column) -{ - $out_arr = []; - foreach ($arr as $row) { - if (!isset($out_arr[$row[$key_column]])) { - $out_arr[$row[$key_column]] = 0; - } - $out_arr[$row[$key_column]] += $row[$count_column]; - } - return $out_arr; -} /** * Given the contents of a start XML/HMTL tag strips out all the attributes * non listed in $safe_attribute_list diff --git a/src/library/WebArchiveBundle.php b/src/library/WebArchiveBundle.php index 65ab1e202..47e3bc8fb 100755 --- a/src/library/WebArchiveBundle.php +++ b/src/library/WebArchiveBundle.php @@ -118,7 +118,7 @@ class WebArchiveBundle $info = unserialize( file_get_contents($this->dir_name."/description.txt")); } else { - $this->version = 1; + $this->version = C\DEFAULT_CRAWL_FORMAT; } if (isset($info['NUM_DOCS_PER_PARTITION'])) { $this->num_docs_per_partition = $info['NUM_DOCS_PER_PARTITION']; diff --git a/src/library/index_bundle_iterators/IndexBundleIterator.php b/src/library/index_bundle_iterators/IndexBundleIterator.php index 16041c1df..af078e5a4 100644 --- a/src/library/index_bundle_iterators/IndexBundleIterator.php +++ b/src/library/index_bundle_iterators/IndexBundleIterator.php @@ -132,10 +132,6 @@ abstract class IndexBundleIterator implements CrawlConstants if (isset($this->word_key)) { $out .= "Word Key: " . L\toHexString($this->word_key)."\n"; $out .= "Index Name: ".$this->index_name."\n"; - $out .= "Media Type: ".PhraseParser::getMediaType( - $this->word_key) . "\n"; - $out .= "Safe: ". PhraseParser::getSafety($this->word_key) . 
"\n"; - } $out .= "Number of Docs: ".$this->num_docs; if (isset($this->index_bundle_iterator)) { diff --git a/src/library/index_bundle_iterators/WordIterator.php b/src/library/index_bundle_iterators/WordIterator.php index 1fa4d0cab..5742d9a1f 100644 --- a/src/library/index_bundle_iterators/WordIterator.php +++ b/src/library/index_bundle_iterators/WordIterator.php @@ -49,20 +49,21 @@ use seekquarry\yioop\library\IndexManager; class WordIterator extends IndexBundleIterator { /** - * hash of word that the iterator iterates over + * hash of word or phrase that the iterator iterates over * @var string */ public $word_key; /** - * The timestamp of the index is associated with this iterator - * @var string + * Position from end of key that doesn't have to be an exact match + * (for phrases as using suffix tree) + * @var int */ - public $index_name; + public $shift; /** - * The byte mask to apply against the word id + * The timestamp of the index is associated with this iterator * @var string */ - public $mask; + public $index_name; /** * First shard generation that word info was obtained for * @var int @@ -155,6 +156,8 @@ class WordIterator extends IndexBundleIterator * Creates a word iterator with the given parameters. * * @param string $word_key hash of word or phrase to iterate docs of + * @param string $shift up to what point in key should be a match + * when do dictionary look up (for phrases because using suffix tree) * @param string $index_name time_stamp of the to use * @param bool $raw whether the $word_key is our variant of base64 encoded * @param array $filter an array of hashes of domains to filter from @@ -165,34 +168,33 @@ class WordIterator extends IndexBundleIterator * gotten out of this iterator (may be reordered later). 
This flag * controls whether an upper bound of self::LIMIT_FEEDS_COUNT is * imposed on the number of feed results returned - * @param string $mask byte mask to apply against word id, default is for - * exact match */ - public function __construct($word_key, $index_name, $raw = false, + public function __construct($word_key, $shift, $index_name, $raw = false, &$filter = null, $results_per_block = IndexBundleIterator::RESULTS_PER_BLOCK, - $limit_feeds = false, - $mask = "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF") + $limit_feeds = false) { if ($raw == false) { //get rid of out modified base64 encoding $word_key = L\unbase64Hash($word_key); } + if (L\crawlHashWord("media:news", true) == $word_key) { + $this->is_news = true; + } if ($filter != null) { $this->filter = & $filter; } else { $this->filter = null; } $this->word_key = $word_key; + $this->shift = $shift; $this->index_name = $index_name; - $this->mask = $mask; list($estimated_total, $this->dictionary_info) = - IndexManager::getWordInfo($index_name, $word_key, 0, - $mask, -1, -1, C\NUM_DISTINCT_GENERATIONS, true); + IndexManager::getWordInfo($index_name, $word_key, $shift, + -1, -1, C\NUM_DISTINCT_GENERATIONS, true); $this->feed_shard_name = C\WORK_DIRECTORY."/feeds/index"; if ((!C\nsdefined('NO_FEEDS') || !C\NO_FEEDS) && file_exists($this->feed_shard_name)) { - //NO_FEEDS defined true in statistic_controller.php $this->use_feeds = true; } else { $this->use_feeds = false; @@ -306,8 +308,8 @@ class WordIterator extends IndexBundleIterator if ($this->start_generation > 0) { list($estimated_total, $this->dictionary_info) = IndexManager::getWordInfo($this->index_name, - $this->word_key, 0, $this->mask, -1, 0, - C\NUM_DISTINCT_GENERATIONS, true); + $this->word_key, 0, -1, 0, C\NUM_DISTINCT_GENERATIONS, + true); $this->num_docs = $this->feed_count + $estimated_total; ksort($this->dictionary_info); $this->dictionary_info = array_values($this->dictionary_info); @@ -362,13 +364,8 @@ class WordIterator extends 
IndexBundleIterator $this->next_offset, $this->feed_end, $this->results_per_block); $time = time(); - // C1 is the materialized meta for media:news - $is_news = ($this->word_key[9] == "\xC1") ? true : false; foreach ($pre_results as $keys => $pre_result) { $pre_results[$keys][self::IS_FEED] = true; - if ($is_news) { - $pre_results[$keys][self::IS_NEWS] = true; - } $delta = $time - $pre_result[self::SUMMARY_OFFSET]; $pre_results[$keys][self::DOC_RANK] = 720000 / max($delta, 1); @@ -545,8 +542,7 @@ class WordIterator extends IndexBundleIterator $this->generation_pointer >= $this->num_generations) { list($estimated_remaining_total, $info) = IndexManager::getWordInfo($this->index_name, - $this->word_key, 0, - $this->mask, -1, $this->num_generations, + $this->word_key, 0, -1, $this->num_generations, C\NUM_DISTINCT_GENERATIONS, true); if (count($info) > 0) { $this->num_docs = $this->seen_docs + diff --git a/src/library/indexing_plugins/RecipePlugin.php b/src/library/indexing_plugins/RecipePlugin.php index b351be772..65c08b5dc 100644 --- a/src/library/indexing_plugins/RecipePlugin.php +++ b/src/library/indexing_plugins/RecipePlugin.php @@ -429,7 +429,7 @@ class RecipePlugin extends IndexingPlugin implements CrawlConstants $index_archive->dictionary->mergeAllTiers(); $this->db->setWorldPermissionsRecursive( C\CRAWL_DIR.'/cache/'. - self::index_data_base_name.$index_name); + self::index_data_base_name . 
$index_name); } L\crawlLog("...Recipe plugin finished."); } diff --git a/src/library/media_jobs/FeedsUpdateJob.php b/src/library/media_jobs/FeedsUpdateJob.php index 0f664d1d1..1c9d2137a 100644 --- a/src/library/media_jobs/FeedsUpdateJob.php +++ b/src/library/media_jobs/FeedsUpdateJob.php @@ -591,8 +591,7 @@ class FeedsUpdateJob extends MediaJob $meta_ids = $this->calculateMetas($lang, $item['PUBDATE'], $source_name, $item["GUID"], $media_category); $prune_shard->addDocumentWords($doc_keys, $item['PUBDATE'], - $word_and_qa_lists["WORD_LIST"], $meta_ids, - PhraseParser::$materialized_metas, true, false); + $word_and_qa_lists["WORD_LIST"], $meta_ids, true, false); } } $prune_shard->save(); diff --git a/src/locale/ar/configure.ini b/src/locale/ar/configure.ini index 7afb07372..8b3c5a6e4 100755 --- a/src/locale/ar/configure.ini +++ b/src/locale/ar/configure.ini @@ -800,7 +800,6 @@ search_view_search = "البحث" search_view_no_index_set = "" search_view_calculated = "%s ثوان." search_view_results = "عرض %s- %s من %s" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "مؤقتاً" @@ -810,7 +809,6 @@ search_view_inlink = "Inlinks" search_view_rank = "رتبة: %s" search_view_relevancy = "ق Rel:%" search_view_proximity = "ق Prox:%" -search_view_thesaurus_score = "" search_view_score = "نقاط: %s" ; ; /src/views/elements diff --git a/src/locale/bn/configure.ini b/src/locale/bn/configure.ini index 22371535a..2e6aca5ee 100755 --- a/src/locale/bn/configure.ini +++ b/src/locale/bn/configure.ini @@ -800,7 +800,6 @@ search_view_search = "" search_view_no_index_set = "" search_view_calculated = "" search_view_results = "" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "" @@ -810,7 +809,6 @@ search_view_inlink = "" search_view_rank = "" search_view_relevancy = "" search_view_proximity = "" -search_view_thesaurus_score = "" search_view_score = "" ; ; 
/src/views/elements diff --git a/src/locale/de/configure.ini b/src/locale/de/configure.ini index 44abbd47d..1b5fc1df9 100755 --- a/src/locale/de/configure.ini +++ b/src/locale/de/configure.ini @@ -800,7 +800,6 @@ search_view_search = "Suche" search_view_no_index_set = "" search_view_calculated = "" search_view_results = "" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "" @@ -810,7 +809,6 @@ search_view_inlink = "" search_view_rank = "" search_view_relevancy = "" search_view_proximity = "" -search_view_thesaurus_score = "" search_view_score = "" ; ; /src/views/elements diff --git a/src/locale/en_US/configure.ini b/src/locale/en_US/configure.ini index 4d8ecbda9..e1ea2704d 100644 --- a/src/locale/en_US/configure.ini +++ b/src/locale/en_US/configure.ini @@ -800,7 +800,6 @@ search_view_search = "Search" search_view_no_index_set = "No Default Index Set" search_view_calculated = "%s seconds." search_view_results = "Showing %s - %s of %s" -search_view_thesaurus_results = "Thesaurus Results" search_view_possible_answer = "Possible Answer:" search_view_word_cloud = "Words:" search_view_cache = "Cached" @@ -810,7 +809,6 @@ search_view_inlink = "Inlinks" search_view_rank = "Rank:%s " search_view_relevancy = "Rel:%s " search_view_proximity = "Prox:%s" -search_view_thesaurus_score = "Thesaurus: %s" search_view_score = "Score:%s" ; ; /src/views/elements diff --git a/src/locale/en_US/resources/Tokenizer.php b/src/locale/en_US/resources/Tokenizer.php index 037134aa1..c1a18ede7 100755 --- a/src/locale/en_US/resources/Tokenizer.php +++ b/src/locale/en_US/resources/Tokenizer.php @@ -32,18 +32,11 @@ use seekquarry\yioop\configs as C; use seekquarry\yioop\library\PhraseParser; use seekquarry\yioop\library as L; -/* If you would like to use wordnet for thesaurus reordering of query results - define the following variable in your configs/local_config.php file with - the path to the WordNet executable. 
- */ -if (!C\nsdefined("WORDNET_EXEC")) { - C\nsdefine("WORDNET_EXEC", ""); -} /** * This class has a collection of methods for English locale specific * tokenization. In particular, it has a stemmer, a stop word remover (for - * use mainly in word cloud creation), and a part of speech tagger (if - * thesaurus reordering used). The stemmer is my stab at implementing the + * use mainly in word cloud creation), and a part of speech tagger (for + * question answering). The stemmer is my stab at implementing the * Porter Stemmer algorithm * presented http://tartarus.org/~martin/PorterStemmer/def.txt * The code is based on the non-thread safe C version given by Martin Porter. @@ -115,16 +108,10 @@ class Tokenizer */ private static $j; /** - * The constructor for a tokenizer can be used to say that a thesaurus - * for final query reordering is present. For english we do this if - * the WORDNET_EXEC variable is set. In which case we use WordNet for - * our reordering + * Do any global set up for tokenizer (none in the case of en-US) */ public function __construct() { - if (C\WORDNET_EXEC != "") { - $this->use_thesaurus = true; - } } /** * Stub function which could be used for a word segmenter. @@ -139,91 +126,6 @@ class Tokenizer { return $pre_segment; } - /** - * Computes similar words and scores from WordNet output based on word - * type. - * - * @param string $term term to find related thesaurus terms - * @param string $word_type is the type of word such as "NN" (noun), - * "VB" (verb), "AJ" (adjective), or "AV" (adverb) - * (all other types will be ignored) - * @param string $whole_query the original query $term came from - * @return array a sequence of - * (score => array of thesaurus terms) associations. 
The score - * representing one word sense of term - */ - public static function scoredThesaurusMatches($term, $word_type, - $whole_query) - { - $word_map = ["VB" => "verb", "NN" => "noun", "AJ" => "adj", - "AV" => "adv"]; - //Gets overview of senses of term[$i] into data - exec(C\WORDNET_EXEC . " $term -over", $data); - if (!$data || ! isset($word_map[$word_type])) { return null; } - $full_name = $word_map[$word_type]; - $lexicon_output = implode("\n", $data); - $sense_parts = preg_split("/\bThe\s$full_name".'[^\n]*\n\n/', - $lexicon_output); - if (!isset($sense_parts[1])) {return null; } - list($sense, ) = preg_split("/\bOverview\sof\s/", $sense_parts[1]); - $definitions_for_sense = preg_split("/\d+\.\s/", $sense, -1, - PREG_SPLIT_NO_EMPTY); - $num_definitions = count($definitions_for_sense); - $sentence = []; - $similar_phrases = []; - $avg_scores = []; - for ($i = 0; $i < $num_definitions; $i++) { - //get sentence fragments examples of using that definition - preg_match_all('/\"(.*?)\"/', $definitions_for_sense[$i], - $matches); - // to separate out the words - preg_match('/[\w+\s\,\.\']+\s\-+/', $definitions_for_sense[$i], - $match_word); - $thesaurus_phrases = preg_split("/\s*\,\s*/", - strtolower(rtrim(trim($match_word[0]), "-"))); - //remove ori ginal term from thesaurus phrases if present - $m = 0; - foreach ($thesaurus_phrases as $thesaurus_phrase) { - $tphrase = trim($thesaurus_phrase); - if ($tphrase == trim($term)) { - unset($thesaurus_phrases[$m]); - } - $m++; - } - $thesaurus_phrases = array_filter($thesaurus_phrases); - if ($thesaurus_phrases == []) {continue;} - $num_example_sentences = count($matches[1]); - $score = []; - for ($j = 0; $j < $num_example_sentences; $j++) { - $query_parts = explode(' ', strtolower($whole_query)); - $example_sentence_parts = explode(' ', - strtolower($matches[1][$j])); - $score[$j] = PhraseParser::getCosineRank($query_parts, - $example_sentence_parts); - /* If Cosine similarity is zero then go for - * intersection 
similarity ranking - */ - if ($score[$j] == 0) { - $score[$j] = PhraseParser::getIntersection($query_parts, - $example_sentence_parts); - } - } - /* We use the rounded average of the above times 100 as a score - score for a definition. To avoid ties we store in the low - order digits 99 - the definition it was - */ - if ($num_example_sentences > 0) { - $definition_score = 100 * round( - 100 * (array_sum($score) / $num_example_sentences)) - + (99 - $i); - } else { - $definition_score = 99 - $i; - } - $similar_phrases[$definition_score] = $thesaurus_phrases; - } - krsort($similar_phrases); - return $similar_phrases; - } /** * Removes the stop words from the page (used for Word Cloud generation) * @@ -320,7 +222,7 @@ class Tokenizer 'theyve','think','this','those', 'thou','though','thoughh','thousand','throug', 'through','throughout','thru', - 'thus','til','tip','to','together','too', + 'thus','til', 'till','tip','to','together','too', 'took','toward','towards','tried', 'tries','truly','try','trying','ts','twice','two','u','un','under', 'unfortunately','unless','unlike','unlikely','until','unto','up', diff --git a/src/locale/es/configure.ini b/src/locale/es/configure.ini index 57efd1d02..450e51af0 100755 --- a/src/locale/es/configure.ini +++ b/src/locale/es/configure.ini @@ -800,7 +800,6 @@ search_view_search = "Buscar" search_view_no_index_set = "" search_view_calculated = "" search_view_results = "" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "" @@ -810,7 +809,6 @@ search_view_inlink = "" search_view_rank = "" search_view_relevancy = "" search_view_proximity = "" -search_view_thesaurus_score = "" search_view_score = "" ; ; /src/views/elements diff --git a/src/locale/fa/configure.ini b/src/locale/fa/configure.ini index c73c7a95b..6c9aacf88 100755 --- a/src/locale/fa/configure.ini +++ b/src/locale/fa/configure.ini @@ -800,7 +800,6 @@ search_view_search = "جستجو" search_view_no_index_set = "" 
search_view_calculated = "%s ثانیه" search_view_results = "در حال نمایش %s - %s از %s" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "کش شده" @@ -810,7 +809,6 @@ search_view_inlink = "پیوندهای داخلی" search_view_rank = "رتبه: %s" search_view_relevancy = "ارتباط: %s" search_view_proximity = "نزدیکی: %s" -search_view_thesaurus_score = "" search_view_score = "امتیاز: %s" ; ; /src/views/elements diff --git a/src/locale/fr_FR/configure.ini b/src/locale/fr_FR/configure.ini index 09574c152..84278455a 100755 --- a/src/locale/fr_FR/configure.ini +++ b/src/locale/fr_FR/configure.ini @@ -800,7 +800,6 @@ search_view_search = "Rechercher" search_view_no_index_set = "" search_view_calculated = "%s secondes." search_view_results = "Affichage de %s - %s sur %s résultats" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "En Cache" @@ -810,7 +809,6 @@ search_view_inlink = "Liens retour" search_view_rank = "Rang: %s" search_view_relevancy = "Pertinence: %s" search_view_proximity = "Proximité: %s" -search_view_thesaurus_score = "" search_view_score = "Total: %s" ; ; /src/views/elements diff --git a/src/locale/he/configure.ini b/src/locale/he/configure.ini index c8fa6b01b..2a9cbfffd 100755 --- a/src/locale/he/configure.ini +++ b/src/locale/he/configure.ini @@ -800,7 +800,6 @@ search_view_search = "חפש" search_view_no_index_set = "" search_view_calculated = "" search_view_results = "" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "" @@ -810,7 +809,6 @@ search_view_inlink = "" search_view_rank = "" search_view_relevancy = "" search_view_proximity = "" -search_view_thesaurus_score = "" search_view_score = "" ; ; /src/views/elements diff --git a/src/locale/hi/configure.ini b/src/locale/hi/configure.ini index 5d9b1dba4..26f554ce0 100755 --- a/src/locale/hi/configure.ini +++ 
b/src/locale/hi/configure.ini @@ -800,7 +800,6 @@ search_view_search = "खोज" search_view_no_index_set = "" search_view_calculated = "" search_view_results = "" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "" @@ -810,7 +809,6 @@ search_view_inlink = "" search_view_rank = "" search_view_relevancy = "" search_view_proximity = "" -search_view_thesaurus_score = "" search_view_score = "" ; ; /src/views/elements diff --git a/src/locale/in_ID/configure.ini b/src/locale/in_ID/configure.ini index 2acc2dd67..9af7f978f 100755 --- a/src/locale/in_ID/configure.ini +++ b/src/locale/in_ID/configure.ini @@ -800,7 +800,6 @@ search_view_search = "Cari" search_view_no_index_set = "" search_view_calculated = "" search_view_results = "Hasil" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "" @@ -810,7 +809,6 @@ search_view_inlink = "" search_view_rank = "Urutan" search_view_relevancy = "" search_view_proximity = "" -search_view_thesaurus_score = "" search_view_score = "" ; ; /src/views/elements diff --git a/src/locale/it/configure.ini b/src/locale/it/configure.ini index 2004095e2..c8e0324b3 100755 --- a/src/locale/it/configure.ini +++ b/src/locale/it/configure.ini @@ -800,7 +800,6 @@ search_view_search = "Cerca" search_view_no_index_set = "" search_view_calculated = "Calccolati in %s secondi." 
search_view_results = "Mostra risultati %s - %s di %s" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "Archivio" @@ -810,7 +809,6 @@ search_view_inlink = "Inlink" search_view_rank = "Pos.: %s " search_view_relevancy = "Rel: %s " search_view_proximity = "Pros: %s" -search_view_thesaurus_score = "" search_view_score = "Punteggio %s" ; ; /src/views/elements diff --git a/src/locale/ja/configure.ini b/src/locale/ja/configure.ini index c07c80463..91ecd6c6d 100755 --- a/src/locale/ja/configure.ini +++ b/src/locale/ja/configure.ini @@ -800,7 +800,6 @@ search_view_search = "検索" search_view_no_index_set = "" search_view_calculated = "%s分で計算しました。" search_view_results = "結果表示%s ー %s の %s" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "キャッシューしました。" @@ -810,7 +809,6 @@ search_view_inlink = "" search_view_rank = "ランク:%s" search_view_relevancy = "関連:%s" search_view_proximity = "近さ: %s" -search_view_thesaurus_score = "" search_view_score = "スコア %s" ; ; /src/views/elements diff --git a/src/locale/kn/configure.ini b/src/locale/kn/configure.ini index 66abefcb7..de6e899c9 100755 --- a/src/locale/kn/configure.ini +++ b/src/locale/kn/configure.ini @@ -800,7 +800,6 @@ search_view_search = "ಹುಡುಕು" search_view_no_index_set = "" search_view_calculated = "ಲೆಕ್ಕಾಚಾರದ ಸಮಯ %s ಸೆಕೆಂಡು" search_view_results = "ತೋರಿಸುತ್ತಿರುವ ಫಲಿತಾಂಶಗಳು %s - %s ಆಫ್ %s" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "ಸಿದ್ಧ ಸ್ಮೃತಿಕೋಶದಿಂದ ನೋಡಿ" @@ -810,7 +809,6 @@ search_view_inlink = "ಒಳ ಕೊಂಡಿ" search_view_rank = "ಸ್ಥಾನ: %s" search_view_relevancy = "ಪ್ರಾಸ್ತಾವಿಕ: %s" search_view_proximity = "ಸಾನಿಧ್ಯ: %s" -search_view_thesaurus_score = "" search_view_score = "ಅಂಕ: %s " ; ; /src/views/elements diff --git a/src/locale/ko/configure.ini b/src/locale/ko/configure.ini index 1734b5a6b..ef554515d 100755 --- 
a/src/locale/ko/configure.ini +++ b/src/locale/ko/configure.ini @@ -800,7 +800,6 @@ search_view_search = "검색" search_view_no_index_set = "" search_view_calculated = "%s 초 결과 완료" search_view_results = "결과 %s - %s 의 %s" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "캐시 됀것" @@ -810,7 +809,6 @@ search_view_inlink = "인링크" search_view_rank = "랭크: %s" search_view_relevancy = "관련성: %s " search_view_proximity = "" -search_view_thesaurus_score = "" search_view_score = "점수 %s" ; ; /src/views/elements diff --git a/src/locale/nl/configure.ini b/src/locale/nl/configure.ini index bf4a694da..79f67927c 100644 --- a/src/locale/nl/configure.ini +++ b/src/locale/nl/configure.ini @@ -800,7 +800,6 @@ search_view_search = "zoeken" search_view_no_index_set = "Geen Standaard Index Set" search_view_calculated = " %s seconden." search_view_results = "Toont %s - %s van %s" -search_view_thesaurus_results = "thesaurus Resultaten" search_view_possible_answer = "" search_view_word_cloud = "woorden:" search_view_cache = "gecached" @@ -810,7 +809,6 @@ search_view_inlink = "inlinks" search_view_rank = "Rang: %s" search_view_relevancy = "Rel: %s" search_view_proximity = "Prox: %s" -search_view_thesaurus_score = "Thesaurus: %s" search_view_score = "Score: %s" ; ; /src/views/elements diff --git a/src/locale/pl/configure.ini b/src/locale/pl/configure.ini index 42144e947..e632fa466 100755 --- a/src/locale/pl/configure.ini +++ b/src/locale/pl/configure.ini @@ -800,7 +800,6 @@ search_view_search = "Szukaj" search_view_no_index_set = "" search_view_calculated = "" search_view_results = "" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "" @@ -810,7 +809,6 @@ search_view_inlink = "" search_view_rank = "" search_view_relevancy = "" search_view_proximity = "" -search_view_thesaurus_score = "" search_view_score = "" ; ; /src/views/elements diff --git 
a/src/locale/pt/configure.ini b/src/locale/pt/configure.ini index a703975b6..52ae0b154 100755 --- a/src/locale/pt/configure.ini +++ b/src/locale/pt/configure.ini @@ -800,7 +800,6 @@ search_view_search = "Pesquisa" search_view_no_index_set = "" search_view_calculated = "" search_view_results = "" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "" @@ -810,7 +809,6 @@ search_view_inlink = "" search_view_rank = "" search_view_relevancy = "" search_view_proximity = "" -search_view_thesaurus_score = "" search_view_score = "" ; ; /src/views/elements diff --git a/src/locale/ru/configure.ini b/src/locale/ru/configure.ini index 345a881a0..cdacf6e7d 100755 --- a/src/locale/ru/configure.ini +++ b/src/locale/ru/configure.ini @@ -800,7 +800,6 @@ search_view_search = "Поиск" search_view_no_index_set = "" search_view_calculated = "" search_view_results = "" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "" @@ -810,7 +809,6 @@ search_view_inlink = "" search_view_rank = "" search_view_relevancy = "" search_view_proximity = "" -search_view_thesaurus_score = "" search_view_score = "" ; ; /src/views/elements diff --git a/src/locale/te/configure.ini b/src/locale/te/configure.ini index f7f7d2c51..b4ec09493 100644 --- a/src/locale/te/configure.ini +++ b/src/locale/te/configure.ini @@ -800,7 +800,6 @@ search_view_search = "అన్వేషించు" search_view_no_index_set = "డిఫాల్ట్ సూచిక సెట్ చేసి లేదు" search_view_calculated = "%s సెకన్లు" search_view_results = "చూపించేది %s - %s of %s" -search_view_thesaurus_results = "థెసారస్ ఫలితాలు" search_view_possible_answer = "" search_view_word_cloud = "వర్డ్స్:" search_view_cache = "కేష్ చేయబడినవి" @@ -810,7 +809,6 @@ search_view_inlink = "ఇన్ లింక్స్" search_view_rank = "రేంక్:%s" search_view_relevancy = "సంబంధిత:%s" search_view_proximity = "సామీప్యత:%s" -search_view_thesaurus_score = "థెసారస్: %s" search_view_score = 
"స్కోర్:%s" ; ; /src/views/elements diff --git a/src/locale/th/configure.ini b/src/locale/th/configure.ini index bb94f8fbb..bfeaa253c 100755 --- a/src/locale/th/configure.ini +++ b/src/locale/th/configure.ini @@ -800,7 +800,6 @@ search_view_search = "" search_view_no_index_set = "" search_view_calculated = "" search_view_results = "" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "" @@ -810,7 +809,6 @@ search_view_inlink = "" search_view_rank = "" search_view_relevancy = "" search_view_proximity = "" -search_view_thesaurus_score = "" search_view_score = "" ; ; /src/views/elements diff --git a/src/locale/tr/configure.ini b/src/locale/tr/configure.ini index 7488d07d0..bfc4b6699 100755 --- a/src/locale/tr/configure.ini +++ b/src/locale/tr/configure.ini @@ -800,7 +800,6 @@ search_view_search = "Ara" search_view_no_index_set = "" search_view_calculated = "" search_view_results = "" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "" @@ -810,7 +809,6 @@ search_view_inlink = "" search_view_rank = "" search_view_relevancy = "" search_view_proximity = "" -search_view_thesaurus_score = "" search_view_score = "" ; ; /src/views/elements diff --git a/src/locale/vi_VN/configure.ini b/src/locale/vi_VN/configure.ini index 5e46a5366..453814424 100755 --- a/src/locale/vi_VN/configure.ini +++ b/src/locale/vi_VN/configure.ini @@ -800,7 +800,6 @@ search_view_search = "Tìm Kiếm" search_view_no_index_set = "" search_view_calculated = "%s giây." 
search_view_results = "Cho kết quả tứ %s - %s của %s" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "Trang gốc" @@ -810,7 +809,6 @@ search_view_inlink = "" search_view_rank = "Thứ Tự: %s" search_view_relevancy = "Thích hợp: %s" search_view_proximity = "Gần: %s" -search_view_thesaurus_score = "" search_view_score = "Điểm: %s" ; ; /src/views/elements diff --git a/src/locale/zh_CN/configure.ini b/src/locale/zh_CN/configure.ini index 1bf771e54..c26ccdc1a 100755 --- a/src/locale/zh_CN/configure.ini +++ b/src/locale/zh_CN/configure.ini @@ -800,7 +800,6 @@ search_view_search = "搜尋" search_view_no_index_set = "" search_view_calculated = "總計: %s 秒" search_view_results = "結果" -search_view_thesaurus_results = "" search_view_possible_answer = "" search_view_word_cloud = "" search_view_cache = "" @@ -810,7 +809,6 @@ search_view_inlink = "" search_view_rank = "排名: %s 名" search_view_relevancy = "關聯度: %s 趴" search_view_proximity = "" -search_view_thesaurus_score = "" search_view_score = "分數" ; ; /src/views/elements diff --git a/src/models/ParallelModel.php b/src/models/ParallelModel.php index 86a47b0bb..aa9ef9480 100755 --- a/src/models/ParallelModel.php +++ b/src/models/ParallelModel.php @@ -369,15 +369,14 @@ class ParallelModel extends Model if (!isset($index_archive->generation_info['ACTIVE'])) { return false; } - $mask = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"; $num_generations = $index_archive->generation_info['ACTIVE']; - $hash_key = ($is_key) ? L\crawlHashWord($url_or_key, true, $mask) : - L\crawlHashWord("info:$url_or_key", true, $mask); - $info = IndexManager::getWordInfo($index_name, $hash_key, 0, $mask, 1); + $hash_key = ($is_key) ? 
L\crawlHashWord($url_or_key, true) : + L\crawlHashWord("info:$url_or_key", true); + $info = IndexManager::getWordInfo($index_name, $hash_key, 0, 1); if (!isset($info[0][4])) { return false; } - $word_iterator = new WordIterator($info[0][4], $index_name, true); + $word_iterator = new WordIterator($info[0][4], 0, $index_name, true); if (is_array($next_docs = $word_iterator->nextDocsWithWord())) { $doc_info = current($next_docs); if (!$doc_info) { diff --git a/src/models/PhraseModel.php b/src/models/PhraseModel.php index 17dd9532a..63e457a83 100755 --- a/src/models/PhraseModel.php +++ b/src/models/PhraseModel.php @@ -35,7 +35,6 @@ use seekquarry\yioop\library as L; use seekquarry\yioop\library\AnalyticsManager; use seekquarry\yioop\library\IndexManager; use seekquarry\yioop\library\PhraseParser; -use seekquarry\yioop\library\Thesaurus; use seekquarry\yioop\library\index_bundle_iterators as I; /** @@ -494,10 +493,6 @@ class PhraseModel extends ParallelModel $results['TOTAL_ROWS'] > 0) { $output = $this->formatPageResults($results, $format_words, $description_length); - if (isset($out_results['THESAURUS_VARIANTS'])) { - $output['THESAURUS_VARIANTS'] = - $out_results['THESAURUS_VARIANTS']; - } if (!empty($answer_score_map)) { arsort($answer_score_map); reset($answer_score_map); @@ -534,10 +529,10 @@ class PhraseModel extends ParallelModel $in2 = $indent . $indent; $in3 = $in2 . $indent; $in4 = $in2. $in2; - $phrase = " ".$phrase; + $phrase = " " . 
$phrase; $phrase = $this->parseIfConditions($phrase); $phrase_string = $phrase; - list($found_metas, $found_materialized_metas, $disallow_phrases, + list($found_metas, $disallow_phrases, $phrase_string, $query_string, $index_name, $weight) = $this->extractMetaWordInfo($phrase); /* @@ -585,8 +580,7 @@ class PhraseModel extends ParallelModel $new_words = PhraseParser::extractPhrases($phrase_part, $locale_tag, $index_name); - if (isset($new_words[0]) && strpos($new_words[0], " ") > 0 && - $found_materialized_metas == []) { + if (isset($new_words[0]) && strpos($new_words[0], " ") > 0) { array_pop($new_words); } $base_words = array_merge($base_words, $new_words); @@ -597,11 +591,6 @@ class PhraseModel extends ParallelModel //stemmed, if have stemmer $index_version = IndexManager::getVersion($index_name); $add_metas = $found_metas; - $immaterial_metas = array_diff( - $found_metas, $found_materialized_metas); - if (count($immaterial_metas) > 0 && $index_version > 0) { - $add_metas = $immaterial_metas; - } $words = array_merge($base_words, $add_metas); if (count($words) == 0 && count($disallow_phrases) > 0) { $words[] = "site:any"; @@ -637,20 +626,7 @@ class PhraseModel extends ParallelModel if (isset($words) && count($words) == 1 && count($disallow_phrases) < 1 && !strpos($words[0], " ")) { $phrase_string = $words[0]; - if ($index_version == 0) { - $tmp_hash = L\allCrawlHashPaths($phrase_string); - $tmp_hash = (is_array($tmp_hash)) ? 
$tmp_hash : [$tmp_hash]; - $phrase_hash = array_merge([$tmp_hash], - [L\crawlHash($phrase_string)]); - } else { - if ($found_materialized_metas == []) { - $phrase_hash = L\allCrawlHashPaths($phrase_string); - } else { - $phrase_hash = L\allCrawlHashPaths($phrase_string, - $found_materialized_metas, - PhraseParser::$materialized_metas); - } - } + $phrase_hash = L\allCrawlHashPaths($phrase_string); $word_struct = ["KEYS" => [$phrase_hash], "QUOTE_POSITIONS" => null, "DISALLOW_KEYS" => [], "WEIGHT" => $weight, "INDEX_NAME" => $index_name, @@ -658,32 +634,9 @@ class PhraseModel extends ParallelModel } else { //get a raw list of words and their hashes $hashes = []; - $metas_accounted = false; - $materialized_metas = []; - $meta_keys = []; $word_keys = []; foreach ($words as $word) { - if (!$metas_accounted && substr_count($word, " ") == 0 - && !in_array($word, $found_metas)) { - $metas_accounted = true; - $materialized_metas = $found_materialized_metas; - } - $tmp_hash = L\allCrawlHashPaths($word, $materialized_metas, - PhraseParser::$materialized_metas); - if ($index_version == 0) { - $tmp_hash = (is_array($tmp_hash)) ? 
$tmp_hash : [$tmp_hash]; - $test = array_merge($tmp_hash, [L\crawlHash($word)]); - } else { - if (in_array($word, $found_materialized_metas) && - !$metas_accounted) { - $meta_keys[] = $tmp_hash; - } else { - $word_keys[] = $tmp_hash; - } - } - } - if (!$metas_accounted) { - $word_keys = array_merge($word_keys, $meta_keys); + $word_keys[] = L\allCrawlHashPaths($word); } if (count($word_keys) == 0) { $word_keys = null; @@ -774,7 +727,6 @@ class PhraseModel extends ParallelModel $index_name = $this->index_name; $weight = 1; $found_metas = []; - $found_materialized_metas = []; $disallow_phrases = []; $phrase_string = $phrase; $phrase_string = str_replace("&", "&", $phrase_string); @@ -791,24 +743,6 @@ class PhraseModel extends ParallelModel ['i:', 'index:', 'w:', 'weight:', '\-'])) { $matches = $matches[2]; $found_metas = array_merge($found_metas, $matches); - if (in_array($meta_word, PhraseParser::$materialized_metas)) { - $seen_matches = []; - $seen_match_count = 0; - foreach ($matches as $pre_material_match) { - $match_kinds = explode(":", $pre_material_match); - if (!in_array($match_kinds[1], ["all"]) && - !isset($match_kinds[2])) { - $found_materialized_metas[] = $pre_material_match; - if ($seen_match_count > 0 && - !isset($seen_matches[$pre_material_match])) { - $materialized_match_conflict = true; - break 2; - } - $seen_matches[$pre_material_match] = true; - $seen_match_count++; - } - } - } } elseif ($meta_word == '\-') { if (count($matches[0]) > 0) { foreach ($matches[2] as $disallowed) { @@ -828,28 +762,18 @@ class PhraseModel extends ParallelModel } if ($materialized_match_conflict) { $found_metas = []; - $found_materialized_metas = []; $disallow_phrases = []; $phrase_string = ""; } $found_metas = array_unique($found_metas); - $found_materialized_metas = array_unique($found_materialized_metas); - if (empty(trim($phrase_string)) && count($found_metas) == 2 - && (in_array("site:doc", $found_metas) - || in_array("site:any", $found_metas))) { - /*site:doc and 
site:any doesn't work with materialized metas by - themselves */ - array_pop($found_materialized_metas); - } $disallow_phrases = array_unique($disallow_phrases); $phrase_string = mb_ereg_replace("&", "_and_", $phrase_string); $query_string = mb_ereg_replace(C\PUNCT, " ", $phrase_string); $query_string = preg_replace("/(\s)+/", " ", $query_string); $query_string = mb_ereg_replace('_and_', '&', $query_string); $phrase_string = mb_ereg_replace('_and_', '&', $phrase_string); - return [$found_metas, $found_materialized_metas, - $disallow_phrases, $phrase_string, $query_string, $index_name, - $weight]; + return [$found_metas, $disallow_phrases, $phrase_string, $query_string, + $index_name, $weight]; } /** * Ideally, this function tries to guess from the query what the @@ -1149,7 +1073,7 @@ class PhraseModel extends ParallelModel $save_timestamp_name == "") { $mem_tmp = serialize($raw).serialize($word_structs). $original_query . $this->index_name; - $summary_hash = L\crawlHash($mem_tmp.":".$limit.":".$num); + $summary_hash = L\crawlHash($mem_tmp . ":" . $limit . ":" . 
$num); if ($use_cache_if_allowed) { $cache_success = true; $results = self::$cache->get($summary_hash); @@ -1405,62 +1329,12 @@ class PhraseModel extends ParallelModel } $results['PAGES'] = $out_pages; $results['TIME'] = time(); - $lang = L\guessLocaleFromString($original_query); - $tokenizer = PhraseParser::getTokenizer($lang); - //only use tokenizer if no meta word or disjuncts in query - if (!preg_match('/(\||\:)/u', $original_query) && - $tokenizer && method_exists($tokenizer, "scoredThesaurusMatches") - && method_exists($tokenizer, "tagPartsOfSpeechPhrase") - && isset($tokenizer->use_thesaurus)) { - $results = $this->sortByThesaurusScore($results, $original_query, - $lang); - if (!$out_pages) { - $results['PAGES'] = $out_pages; - } - } if (!empty($_SERVER["USE_CACHE"]) && $save_timestamp_name == "") { self::$cache->set($summary_hash, $results); } return $results; } - /** - * If user selects Wordnet feature in page options then only - * do WordNet processing. Also user has to specify the WordNet directory - * - * @param array $results document summaries - * @param string $original_query the original query that we are computing - * results for - * @param string $lang locale tag of query - * @return array results document summaries sorted by wordnet score - */ - public function sortByThesaurusScore($results, $original_query, $lang) - { - $summaries = []; - $pages = $results['PAGES']; - foreach ($pages as $page) { - $summaries[] = $page[self::DESCRIPTION]; - } - $index_name = $this->index_name; - $phrases = Thesaurus::getSimilarPhrases($original_query, $index_name, - $lang); - $results['THESAURUS_VARIANTS'] = $phrases; - if (!empty($phrases)) { - $thesaurus_scores = Thesaurus::scorePhrasesSummaries($phrases, - $summaries); - //Store the BM25 score for each page in result array - $num_scores = count($thesaurus_scores); - for ($i = 0; $i < $num_scores; $i++) { - $pages[$i][self::THESAURUS_SCORE] = $thesaurus_scores[$i]; - L\orderCallback($pages[$i], $pages[$i], 
self::THESAURUS_SCORE); - } - if (array_sum($thesaurus_scores) != 0) { - usort($pages, C\NS_LIB . "orderCallback"); - } - $results['PAGES'] = $pages; - } - return $results; - } /** * Used to lookup summary info for the pages provided (using their) * self::SUMMARY_OFFSET field. If any of the lookup-ed summaries @@ -1601,7 +1475,7 @@ class PhraseModel extends ParallelModel */ public function getQueryIterator($word_structs, &$filter, $raw, &$to_retrieve, $queue_servers = [], $original_query = "", - $save_timestamp_name="", $limit_feeds = true) + $save_timestamp_name = "", $limit_feeds = true) { $iterators = []; $total_iterators = 0; @@ -1671,6 +1545,7 @@ class PhraseModel extends ParallelModel continue; } $sum = 0; + $lookup_cutoff = max(C\MIN_RESULTS_TO_GROUP, $to_retrieve); for ($i = 0; $i < $total_iterators; $i++) { $current_key = (is_string($distinct_word_keys[$i])) ? $distinct_word_keys[$i] : (is_string( @@ -1686,74 +1561,25 @@ class PhraseModel extends ParallelModel $min_group_override = true; } else { //can happen if exact phrase search suffix approach used - if (isset($distinct_word_keys[$i][0][0]) && - is_array($distinct_word_keys[$i][0][0])) { - $distinct_keys = [ - $distinct_word_keys[$i][0][1]]; - } elseif (isset($distinct_word_keys[$i][0]) && + if (isset($distinct_word_keys[$i][0]) && is_array($distinct_word_keys[$i][0])) { $distinct_keys = $distinct_word_keys[$i]; } else { $distinct_keys = [$distinct_word_keys[$i]]; } - $out_keys = []; - $old_distinct_key_id = ""; - foreach ($distinct_keys as $distinct_key) { - if (is_array($distinct_key)) { - if (!isset($distinct_key[2]) && - isset($distinct_key[1])) { - $distinct_keys[] = $distinct_key[1]; - } - $shift = (isset($distinct_key[1])) ? - $distinct_key[1] : 0; - $mask = (isset($distinct_key[2])) ? - $distinct_key[2] : "\x00\x00\x00\x00\x00" . 
- "\x00\x00\x00\x00\x00\x00"; - if (isset($distinct_key[3])) { - $old_distinct_key_id = - L\unbase64Hash($distinct_key[3]); - } - $distinct_key_id = L\unbase64Hash( - $distinct_key[0]); - } else { - $shift = 0; - $mask = "\x00\x00\x00\x00\x00" . - "\x00\x00\x00\x00\x00\x00"; - $distinct_key_id = - L\unbase64Hash($distinct_key); - } - $lookup_cutoff = max(C\MIN_RESULTS_TO_GROUP, - $to_retrieve); - $info = IndexManager::getWordInfo($index_name, - $distinct_key_id, $shift, $mask, -1, -1, - C\NUM_DISTINCT_GENERATIONS); - if ($old_distinct_key_id != "") { - $old_info = IndexManager::getWordInfo( - $index_name, $old_distinct_key_id, $shift, - $mask, -1, -1, C\NUM_DISTINCT_GENERATIONS); - if ($info !== false && $old_info !== false) { - $info = array_merge($info, $old_info); - } elseif ($old_info !== false) { - $info = $old_info; - } - } - if ($info != []) { - $tmp_keys = L\arrayColumnCount($info, 4, 3); - $sum += array_sum($tmp_keys); - $out_keys = array_merge($out_keys, $tmp_keys); - } - if ($sum > $lookup_cutoff) { - break; - } - } - $out_keys = array_keys(array_slice($out_keys, 0, 50)); + $sum = 0; $tmp_word_iterators =[]; $m = 0; - foreach ($out_keys as $distinct_key) { + foreach ($distinct_keys as $distinct_key) { + $shift = (isset($distinct_key[1])) ? 
+ $distinct_key[1] : 0; + $distinct_key_id = L\unbase64Hash( + $distinct_key[0]); $tmp_word_iterators[$m] = - new I\WordIterator($distinct_key, + new I\WordIterator($distinct_key_id, $shift, $index_name, true, $filter, $to_retrieve, $limit_feeds); + $sum += $tmp_word_iterators[$m]->num_docs; if ($tmp_word_iterators[$m]->dictionary_info != [] || $tmp_word_iterators[$m]->feed_count > 0) { @@ -1762,6 +1588,9 @@ class PhraseModel extends ParallelModel } else { unset($tmp_word_iterators[$m]); } + if ($sum > $lookup_cutoff) { + break; + } } if ($m == 1) { $word_iterators[$i] = $tmp_word_iterators[0]; @@ -1780,9 +1609,11 @@ class PhraseModel extends ParallelModel $num_disallow_keys = count($disallow_keys); if ($num_disallow_keys > 0) { for ($i = 0; $i < $num_disallow_keys; $i++) { + /* notice for now shift always 0 - you can't disallow + phrases */ $disallow_iterator = - new I\WordIterator($disallow_keys[$i], $index_name, - false, $filter); + new I\WordIterator($disallow_keys[$i], 0, + $index_name, false, $filter); $word_iterators[$num_word_keys + $i] = new I\NegationIterator($disallow_iterator); } diff --git a/src/scripts/suggest.js b/src/scripts/suggest.js index 6c19dd848..9fa3abcbe 100644 --- a/src/scripts/suggest.js +++ b/src/scripts/suggest.js @@ -693,12 +693,9 @@ function spellCheck() } if (referenceNode) { var corrected_spell = elt("spell-check"); - var thesaurus_results = elt("thesaurus-results"); /* corrected_spell might not be present if WORD_SUGGEST off - If there are already thesaurus results we don't want to - clutter the top area so also don't suggest */ - if (!corrected_spell || thesaurus_results) {return; } + if (!corrected_spell) {return; } var logged_in = elt("csrf-token"); if (logged_in) { var csrf_token = elt("csrf-token").value; diff --git a/src/views/SearchView.php b/src/views/SearchView.php index c443c9f5a..b0ea7cd49 100755 --- a/src/views/SearchView.php +++ b/src/views/SearchView.php @@ -211,39 +211,15 @@ class SearchView extends View implements 
CrawlConstants <?php } ?> <div class="serp-body" > - <?php - $similar_words = $data['THESAURUS_VARIANTS']; - $use_thesaurus = C\WORD_SUGGEST && count($similar_words) > 0 && - !$_SERVER["MOBILE"]; - if ($use_thesaurus) { ?> - <div id="thesaurus-results" class="thesaurus"> - <?php - e(tl('search_view_thesaurus_results')); - foreach ($similar_words as $word) { - e("<br />"); - ?><span><a href="?<?= $token_string_amp - ?>its=<?= $data['its'] ?>&q=<?=$word ?>"><?= - $word ?></a></span> - <?php - } - ?> - </div> - <?php - } - if ($use_thesaurus) { ?> - <div class="thesaurus-serp-results"> <?php - } else { ?> - <div class="serp-results"> - <?php - } + <div class="serp-results"><?php if (!$is_landing) { $this->element("displayadvertisement")->render($data); } - if (!empty($data['BEST_ANSWER'])) { ?> - <div id="best-answer" class="echo-link"> + if (!empty($data['BEST_ANSWER'])) { + ?><div id="best-answer" class="echo-link"> <?= $data['BEST_ANSWER'] ?> - </div> - <?php } + </div><?php + } foreach ($data['PAGES'] as $page) { if (isset($page[self::URL])) { if (substr($page[self::URL], 0, 4) == "url|") { @@ -292,8 +268,8 @@ class SearchView extends View implements CrawlConstants $image_subsearch); e( "</div>"); continue; - } else if (isset($page['NEWS'])) { - $this->helper("feeds")->render($page['NEWS'], + } else if (isset($page['FEED'])) { + $this->helper("feeds")->render($page['FEED'], $token, $data['QUERY'], $subsearch, $data['OPEN_IN_TABS']); e( "</div>"); @@ -437,12 +413,6 @@ class SearchView extends View implements CrawlConstants number_format($page[self::RELEVANCE], 2) )."\n"); e(tl('search_view_proximity', number_format($page[self::PROXIMITY], 2) )."\n"); - if (isset($page[self::THESAURUS_SCORE]) && - $page[self::THESAURUS_SCORE] > 0) { - e(tl('search_view_thesaurus_score', - number_format($page[self::THESAURUS_SCORE], 2)) . 
- "\n"); - } if (isset($page[self::USER_RANKS])) { foreach ($page[self::USER_RANKS] as $label => $score) { e($label.":".number_format($score/6553.6, 2)."\n"); diff --git a/tests/IndexShardTest.php b/tests/IndexShardTest.php index a1bc2abbc..22d6f758e 100644 --- a/tests/IndexShardTest.php +++ b/tests/IndexShardTest.php @@ -334,7 +334,7 @@ class IndexShardTest extends UnitTest $meta_ids = ["EEEEEEEE", "FFFFFFFF"]; //test saving and loading to a file $this->test_objects['shard']->addDocumentWords($docid, - $offset, $word_counts, $meta_ids, [], true); + $offset, $word_counts, $meta_ids, true); $this->test_objects['shard']->save(); $this->test_objects['shard2'] = IndexShard::load(C\WORK_DIRECTORY. "/shard.txt");