source:
trunk/includes/keywords.inc.php
@
594
| Revision 594, 10.7 KB checked in by r2, 18 months ago (diff) | |
|---|---|
|
|
| Line | |
|---|---|
| 1 | <?php |
| 2 | |
| 3 | /****************************************************************** |
| 4 | Projectname: Automatic Keyword Generator |
| 5 | Version: 0.3 |
| 6 | Author: Ver Pangonilo <smp_AT_itsp.info> |
| 7 | Last modified: 26 July 2006 |
| 8 | Copyright (C): 2006 Ver Pangonilo, All Rights Reserved |
| 9 | |
| 10 | * GNU General Public License (Version 2, June 1991) |
| 11 | * |
| 12 | * This program is free software; you can redistribute |
| 13 | * it and/or modify it under the terms of the GNU |
| 14 | * General Public License as published by the Free |
| 15 | * Software Foundation; either version 2 of the License, |
| 16 | * or (at your option) any later version. |
| 17 | * |
| 18 | * This program is distributed in the hope that it will |
| 19 | * be useful, but WITHOUT ANY WARRANTY; without even the |
| 20 | * implied warranty of MERCHANTABILITY or FITNESS FOR A |
| 21 | * PARTICULAR PURPOSE. See the GNU General Public License |
| 22 | * for more details. |
| 23 | |
| 24 | Description: |
| 25 | This class automatically generates META keywords for your |
| 26 | web pages based on the contents of your articles. This will |
| 27 | eliminate the tedious process of working out the best |
| 28 | keywords to suit your article. The basis of the keyword |
| 29 | generation is the number of times any word or phrase |
| 30 | occurred within an article. |
| 31 | |
| 32 | This automatic keyword generator will create single words, |
| 33 | two-word phrases and three-word phrases. Single words will be |
| 34 | filtered against a list of common words. |
| 35 | |
| 36 | Change Log: |
| 37 | =========== |
| 38 | 0.2 Ver Pangonilo - 22 July 2005 |
| 39 | ================================ |
| 40 | Added user configurable parameters and commented codes |
| 41 | for easier end user understanding. |
| 42 | |
| 43 | 0.3 Vasilich (vasilich_AT_grafin.kiev.ua) - 26 July 2006 |
| 44 | ========================================================= |
| 45 | Added an encoding parameter to work with UTF texts and a |
| 46 | minimum number of word/phrase occurrences. |
| 47 | |
| 48 | ******************************************************************/ |
| 49 | |
// Generates a comma separated META keyword list (single words, two word
// phrases and three word phrases) from a block of text, ranked by how often
// each word or phrase occurs.
//
// Usage:
//   $kw = new autokeyword($params, 'UTF-8');
//   echo $kw->get_keywords();
//
// Recognised $params keys: content, min_word_length, min_word_occur,
// min_2words_length, min_2words_phrase_length, min_2words_phrase_occur,
// min_3words_length, min_3words_phrase_length, min_3words_phrase_occur.
// A missing key falls back to '' (content) or 0 (every threshold).
class autokeyword {

    //the site contents after normalisation by replace_chars()
    var $contents;
    //encoding passed to mb_internal_encoding() by the constructor
    var $encoding;
    //declared for compatibility; nothing inside this class assigns it
    var $keywords;

    //single words: minimum length of a word and minimum number of
    //occurrences for it to be listed
    var $wordLengthMin;
    var $wordOccuredMin;

    //2 word phrases: minimum length of each word, minimum length of the
    //two words joined without the space, and minimum number of occurrences
    var $word2WordPhraseLengthMin;
    var $phrase2WordLengthMin;
    var $phrase2WordLengthMinOccur;

    //3 word phrases: minimum length of each word, minimum length of the
    //three words joined without spaces, and minimum number of occurrences
    var $word3WordPhraseLengthMin;
    var $phrase3WordLengthMin;
    var $phrase3WordLengthMinOccur;

    //Constructor. Stores the thresholds and normalises the content.
    //  $params   - associative array, see the key list on the class
    //  $encoding - encoding name handed to mb_internal_encoding()
    //Declared before autokeyword() so that it is the constructor on every
    //PHP version: PHP 8 no longer treats a method named after the class
    //as a constructor.
    function __construct($params, $encoding)
    {
        $this->encoding = $encoding;
        //side effect kept from the original: changes the process wide
        //mbstring encoding, which the mb_* calls below rely on
        if (function_exists('mb_internal_encoding')) mb_internal_encoding($encoding);
        $this->contents = $this->replace_chars($this->_param($params, 'content', ''));

        // single word
        $this->wordLengthMin = $this->_param($params, 'min_word_length', 0);
        $this->wordOccuredMin = $this->_param($params, 'min_word_occur', 0);

        // 2 word phrase
        $this->word2WordPhraseLengthMin = $this->_param($params, 'min_2words_length', 0);
        $this->phrase2WordLengthMin = $this->_param($params, 'min_2words_phrase_length', 0);
        $this->phrase2WordLengthMinOccur = $this->_param($params, 'min_2words_phrase_occur', 0);

        // 3 word phrase
        $this->word3WordPhraseLengthMin = $this->_param($params, 'min_3words_length', 0);
        $this->phrase3WordLengthMin = $this->_param($params, 'min_3words_phrase_length', 0);
        $this->phrase3WordLengthMinOccur = $this->_param($params, 'min_3words_phrase_occur', 0);
    }

    //Old style constructor name, kept so existing code that calls it
    //explicitly (or runs on an engine that still honours it) keeps working.
    function autokeyword($params, $encoding)
    {
        $this->__construct($params, $encoding);
    }

    //Returns the keyword list: single words first, then 2 word phrases,
    //then 3 word phrases, separated by ", ". Returns '' when nothing
    //qualifies.
    function get_keywords()
    {
        $keywords = $this->parse_words().$this->parse_2words().$this->parse_3words();
        if ($keywords === '') {
            //substr('', 0, -2) yields false on PHP < 8; always hand back a string
            return '';
        }
        //every entry is followed by ", "; drop the final separator
        return substr($keywords, 0, -2);
    }

    //Normalises raw content: lower cases it, strips HTML tags, turns
    //punctuation and line breaks/tabs into spaces and collapses runs of
    //spaces. Returns the cleaned string.
    function replace_chars($content)
    {
        //lower case first (same order as before), falling back to the
        //byte based function when mbstring is not loaded
        $content = function_exists('mb_strtolower') ? mb_strtolower($content) : strtolower($content);
        $content = strip_tags($content);

        //NOTE(review): the repeated '"', '>', '<' and '«' entries look like
        //HTML entities that were decoded somewhere along the way - confirm
        //against the repository copy. Duplicates are harmless here.
        $punctuations = array(',', ')', '(', '.', "'", '"',
            '<', '>', '!', '?', '/', '-',
            '_', '[', ']', ':', '+', '=', '#',
            '$', '"', '©', '>', '<', '«', '«', '»', ';',
            "\n", "\r", "\t");

        $content = str_replace($punctuations, " ", $content);
        // replace multiple gaps
        $content = preg_replace('/ {2,}/', " ", $content);

        return $content;
    }

    //single words META KEYWORDS
    //Returns the qualifying single words, most frequent first, each
    //followed by ", ". Numeric tokens and common words are skipped.
    function parse_words()
    {
        //list of commonly used words
        // this can be edited to suit your needs
        //NOTE(review): these entries look like Cyrillic text shown in the
        //wrong code page - confirm the file's real encoding matches the
        //encoding of the content being analysed. Capitalised entries can
        //never match because the content is lower cased first.
        $common = array(
            "ñïîñîáíûé", "îêîëî", "ñâåðõó", "àêò", "äîáàâèòü", "áîþñü", "ïîñëå", "ðàç", "ïðîòèâ", "âîçðàñò",
            "ñâîåé", "ñîãëàñåí", "âñå", "ïî÷òè", "òîëüêî", "âìåñòå", "óæå", "òàêæå", "õîòÿ", "âñåãäà",
            "ïpåæäå", "ñóììà", "è", "ãíåâ", "çëûå", "äðóãîé", "îòâåò", "ëþáîé", "ïîÿâëÿþòñÿ", "êîòîpîìó",
            "åñòü", "ïðèáûëè", "ðóêà", "îðóæèå", "îêîëî", "ñïðîñèòü", "ïîïûòêà", "òåòÿ", "â ñòîðîíó", "Íàçàä ",
            "ïëîõîé", "ìåøîê", "çàëèâ", "áûòü", "ñòàë", "ïîòîìó ÷òî", "ñòàòü", "áûë", "ïåðåä", "íà÷àë",
            "íà÷àòü", "áûòèå", "÷òîáû", "ïðèíàäëåæàòü", "íèæå", "ðÿäîì", "ëó÷øèé", "ìåæäó", "áîëüøîé", "òåëî",
            "êîñòü", "ðîä", "áðàòü", "äíî", "êîòîpîå", "ìàëü÷èê", "ïåðåðûâ", "ïðèíåñòè", "ïðèíåñ", "îøèáêà",
            "ïîñòðîèë", "çàíÿòî", "íî", "êóïèòü", "âûçîâ", "ïðèøëè", "ìîæåò", "ïðè÷èíà", "âûáðàòü", "çàêðûòü",
            "ðàññìîòðåòü", "ïðèéòè", "ðàññìîòðåòü", "ñíîâà", "ñîäåðæàòü", "ïðîäîëæàòü", "ýòîãî", "âûðåçàòü", "ñìåë", "òåìíûé",
            "äåëî", "ìèëàÿ", "ðåøèòü", "ãëóáîêèé", "ñäåëàë", "óìåðåòü", "äåëàòü", "íå", "ñäåëàëè", "ñîìíåíèå",
            "âíèç", "âî âðåìÿ", "íè÷åãî", "ðàííåå", "óñèëèå", "ëèáî", "äðóãîãî", "êîíåö", "ïîëüçîâàòüñÿ", "äîñòàòî÷íî",
            "Enter", "äàæå", "íèêîãäà", "êàæäûé", "êðîìå", "îæèäàòü", "îáúÿñíèòü", "ïàäåíèå", "äàëåêî", "òîëñòûé",
            "çà", "ñòðàõ", "÷óâñòâîâàòü", "íîãè", "óïàë", "÷óâñòâîâàë", "íåêîòîðûå", "çàïîëíèòü", "êîòîpûå", "âïèñûâàåòñÿ",
            "ëåòàòü", "ñëåäèòü", "íàâñåãäà", "çàáûëè", "îò", "õîpîøî", "äàë", "ïîëó÷èòü", "äàåò", "èäåò",
            "íåò", "õîðîøî", "ïîëó÷èë", "ñâîåãî", "âåëèêèé", "êîòîpàÿ", "âûðîñ", "ðàñòè", "óãàäàòü", "ïîëîâèíà",
            "ïîâåñèòü", "ñëó÷èëîñü", "åñòü", "øëÿïà", "èìåòü", "îí", "óñëûøàòü", "ñëûøàëè", "ñîñòîÿëàñü", "ïðèâåò",
            "ïîìîùü", "çäåñü", "âûñîêàÿ", "òîãäà", "òàêîé", "äåðæàòü", "ãîðÿ÷èé", "îäíàêî", "åñëè", "ïëîõî",
            "äåéñòâèòåëüíî", "ñâîèì", "åñòü", "åãî", "åå", "ñïðàâåäëèâûé", "äåðæàòü", "çíàë", "çíàòü", "èçâåñòíûì",
            "ïîçäíî", "êàê ìèíèìóì", "ïðèâåëè", "ñëåâà", "îäîëæèòü", "ìåíüøå", "ïóñòü", "êàê", "âåðîÿòíî", "îäèíîêèé",
            "äëèííûé", "ñìîòðåòü", "äåëàòü", "ìíîãî", "âîçìîæíî",
            "ñðåäíÿÿ", "âñòðåòèëèñü", "âåðîÿòíî", "íèêàêèå", "ñâîåìó", "áîëüøå", "ñàìûé", "äâèãàòüñÿ", "äîëæåí", "ìîå",
            "ðÿäîì", "ïî÷òè", "íåîáõîäèìî", "íè", "íèêîãäà", "ñëåäóþùèé", "ê ñâåäåíèþ", "íè÷òî", "ñåé÷àñ", "íîìåð",
            "î", "âûêëþ÷åíî", "çà÷àñòóþ", "àõ", "êîòîpûé", "îäèí ðàç", "èëè", "äðóãèå", "íàø", "âíå",
            "ïîæàëóéñòà", "ïîäãîòîâèòü", "âåðîÿòíî", "òÿíóòü", "÷èñòûé", "òîë÷îê", "ïóòü", "ïîäíèìàòü", "áåæàë", "äîñòèæåíèÿ",
            "ïîíèìàòü", "îòâåòèòü", "òðåáîâàòü", "ñîáîé", "áåæàòü", "ñêàçàë", "òî æå", "âèäåë", "ãîâîðèòü", "âèäåòü",
            "êàæåòñÿ", "ïðîäàòü", "ïîñëàë", "îòäåëüíûé", "ìíîæåñòâî", "îíà", "ñòîðîíà", "çíàê", "òàê", "ïðîäàë",
            "íåêîòîðûå", "ñêîðî", "èçâèíèòå", "îñòàíîâèòüñÿ", "âñåãî", "ïàëêà", "äî ñèõ ïîð", "ñòîÿëè", "òàêèå", "âíåçàïíàÿ",
            "Ïðåäïîëîæèì", "âçÿòü", "áðàòü", "âàøåãî", "âûñîêèé", "ñêàçàòü", "÷åì", "ñïàñèáî", "÷òî", "ñâîèõ",
            "èõ", "çàòåì", "òàì", "ïîýòîìó", "ýòè", "îíè", "ýòî", "òå", "õîòÿ", "ïîñðåäñòâîì",
            "äî", "ê", "ñåãîäíÿ", "ñêàçàë", "çàâòðà", "ñëèøêîì", "âçÿë", "ñîðâàë", "íàó÷èëè", "ïûòàëñÿ",
            "ïûòàåòñÿ", "äîâåðèå", "ïîïðîáîâàòü", "î÷åðåäü", "ïîä", "ïîêà", "ââåðõ", "ïîñëå", "íàñ", "èñïîëüçîâàíèå",
            "îáû÷íûé", "ðàçíûå", "äîëæíû", "î÷åíü", "ñàìîé", "õî÷ó", "ìû", "õîðîøî", "ïîøåë", "áûëè",
            "êîãäà", "ãäå", "áóäü òî", "êîòîðûé", "à", "êàêîé", "êòî", "êîãî", "÷üÿ", "ïî÷åìó",
            "âîëÿ", "ñ", "â", "áåç", "áóäåò", "äà", "òû", "ìîëîäîé", "Âàø", "br",
            "img", "p", "lt", "gt", "quot", "copy", "«");

        //flip once so each lookup is a hash probe instead of a scan of
        //the whole list for every word of the content
        $is_common = array_flip($common);

        $k = array();
        foreach ($this->_tokens() as $word) {
            //keep words that are long enough, not common and not numbers
            if ($this->_length($word) >= $this->wordLengthMin
                && !isset($is_common[$word])
                && !is_numeric($word)) {
                $k[] = $word;
            }
        }

        return $this->_rank($k, $this->wordOccuredMin);
    }

    //2 word phrases META KEYWORDS
    //Returns the qualifying 2 word phrases, most frequent first, each
    //followed by ", ". Returns '' when no phrase qualifies.
    function parse_2words()
    {
        return $this->_parse_phrases(
            2,
            $this->word2WordPhraseLengthMin,
            $this->phrase2WordLengthMin,
            $this->phrase2WordLengthMinOccur
        );
    }

    //3 word phrases META KEYWORDS
    //Returns the qualifying 3 word phrases, most frequent first, each
    //followed by ", ". Returns '' when no phrase qualifies.
    function parse_3words()
    {
        return $this->_parse_phrases(
            3,
            $this->word3WordPhraseLengthMin,
            $this->phrase3WordLengthMin,
            $this->phrase3WordLengthMinOccur
        );
    }

    //Keeps only the entries of a word => count map whose count reaches
    //$min_occur. Returns the filtered map, keys preserved.
    function occure_filter($array_count_values, $min_occur)
    {
        $occur_filtered = array();
        foreach ($array_count_values as $word => $occured) {
            if ($occured >= $min_occur) {
                $occur_filtered[$word] = $occured;
            }
        }

        return $occur_filtered;
    }

    //Concatenates the KEYS of $array, each followed by $gule (so the
    //result ends with a trailing separator). The values are ignored.
    function implode($gule, $array)
    {
        $c = "";
        foreach (array_keys($array) as $key) {
            $c .= $key.$gule;
        }
        return $c;
    }

    //internal: reads one entry of the constructor's $params array,
    //returning $default when the key is absent or null
    function _param($params, $key, $default)
    {
        return isset($params[$key]) ? $params[$key] : $default;
    }

    //internal: character count of $text, byte count when mbstring is
    //not available
    function _length($text)
    {
        return function_exists('mb_strlen') ? mb_strlen($text) : strlen($text);
    }

    //internal: splits the normalised content on spaces into trimmed,
    //non-empty words (leading/trailing spaces would otherwise produce
    //empty entries)
    function _tokens()
    {
        $tokens = array();
        foreach (explode(" ", (string) $this->contents) as $raw) {
            $word = trim($raw);
            if ($word !== '') {
                $tokens[] = $word;
            }
        }
        return $tokens;
    }

    //internal: counts the candidates, drops those seen fewer than
    //$occur_min times, sorts by count (highest first) and joins them
    function _rank($candidates, $occur_min)
    {
        $occur_filtered = $this->occure_filter(array_count_values($candidates), $occur_min);
        arsort($occur_filtered);

        return $this->implode(", ", $occur_filtered);
    }

    //internal: builds every run of $size consecutive words in which each
    //word has at least $word_min characters and the words joined without
    //spaces have at least $phrase_min characters, then ranks them.
    //All minimums are inclusive.
    function _parse_phrases($size, $word_min, $phrase_min, $occur_min)
    {
        $tokens = $this->_tokens();
        //always an array, even when the content has fewer than $size words
        $phrases = array();
        $last_start = count($tokens) - $size;

        for ($i = 0; $i <= $last_start; $i++) {
            $words = array_slice($tokens, $i, $size);

            $qualifies = true;
            foreach ($words as $word) {
                if ($this->_length($word) < $word_min) {
                    $qualifies = false;
                    break;
                }
            }

            //implode() here is the global function, not the method above
            if ($qualifies && $this->_length(implode('', $words)) >= $phrase_min) {
                $phrases[] = implode(' ', $words);
            }
        }

        return $this->_rank($phrases, $occur_min);
    }
}
| 247 | ?> |
Note: See TracBrowser
for help on using the repository browser.
