tf-idfを用いてWEBページの特徴語を抽出する

tf-idf

まぁ、やってることはこちらでやってることと何一つ変わらない。
ただ、対象テキストがWEBページってだけ。
なので内容については上記リンク先で。

http://ja.wikipedia.org/wiki/Tf-idf を引数にしてテスト

テスト結果
Array
(
    [ノート] => 70.550270786977
    [idf] => 58.242566281241
    [トーク] => 39.260016066399
    [collapsiblenav] => 37.198972183809
    [editwarning] => 37.116381689581
    [wgNotice] => 36.981332921722
    [tf] => 29.857403204274
    [単語] => 27.169254869112
    [simplesearch] => 26.690732627211
    [runOnloadHook] => 25.982670229996
    [false] => 23.308651699028
    [vector] => 21.06564049106
    [ドキュメント] => 20.848558681228
    [アルゴリズム] => 19.467134832372
    [出現] => 19.060888352292
    [wikipedia] => 18.973873319242
    [mediawiki] => 18.968250771126
    [ページ] => 17.851484108862
    [利用] => 17.851484108862
    [ファイル] => 16.863049624051
    [言語] => 16.732703103715
    [頻度] => 15.989424100918
    [true] => 15.620048595254
    [表示] => 15.620048595254
    [検索] => 15.620048595254
    [Tf] => 14.928701602137
    [ja] => 14.348497840204
    [disablesuggest] => 14.32633618173
    [isMSIE] => 13.825560893818
    [Wikipedia] => 13.628525681705
    [wgWikimediaMobileUrl] => 13.580438435785
    [expandablesearch] => 13.497056826846
    [wgVectorPreferences] => 13.457836113692
    [wgVectorEnabledModules] => 13.457836113692
    [footercleanup] => 13.454345708753
    [collapsibletabs] => 13.450867444376
    [wgCategories] => 13.443947001532
    [wgNamespaceIds] => 13.426852568172
    [wgSearchNamespaces] => 13.426852568172
    [wgMWSuggestMessages] => 13.423468470188
    [wgMainPageTitle] => 13.42009578571
    [wgFormattedNamespaces] => 13.416734438007
    [wgMWSuggestTemplate] => 13.416734438007
    [wgDigitTransformTable] => 13.413384351122
    [wgSeparatorTransformTable] => 13.413384351122
    [wgRestrictionMove] => 13.410045449856
    [wgRestrictionEdit] => 13.410045449856
    [wgActionPaths] => 13.39024282256
    [wgVariantArticlePath] => 13.39024282256
    [wgNoticeLocal] => 13.39024282256
    [w] => 13.388613081647
    [http] => 13.388613081647
    [wgBreakFrames] => 13.386980176925
    [wgCurRevisionId] => 13.383728141539
    [wgEnableWriteAPI] => 13.383728141539
    [wgEnableAPI] => 13.377255627033
    [wgContentLanguage] => 13.374035012333
    [wgUserLanguage] => 13.367624733972
    [wgIsArticle] => 13.364434938604
    [wgUserGroups] => 13.364434938604
    [wgArticleId] => 13.361255285687
    [wgCanonicalNamespace] => 13.345506928719
    [wgSiteName] => 13.339276378968
    [wgUserName] => 13.330002742183
    [wgNamespaceNumber] => 13.302686931536
    [wgPageName] => 13.27901718745
    [MediaWiki] => 12.651079318259
    [wgVersion] => 12.611037878597
    [wgDBname] => 12.608043864384
    [wgAction] => 12.508530003564
    [wgScriptExtension] => 12.508530003564
    [wgArticlePath] => 12.392402223722
    [wgScriptPath] => 12.367415795939
    [addMessages] => 12.281874358946
    [wgTitle] => 12.254213020858
    [wgUrlProtocols] => 12.234472120052
    [wgServer] => 12.119894949289
    [wgScript] => 12.023751088736
    [jawiki] => 11.699255043162
    [fixalpha] => 11.612745800252
    [org] => 11.157177568039
    [者] => 11.157177568039
    [stylepath] => 11.052341057641
    [失わ] => 10.85799949723
    [writeln] => 8.9992694018962
    [ヘルプ] => 8.925742054431
    [searchTerms] => 8.804875263868
    [worldwind] => 8.7156441301401
    [テンプレート] => 8.6324777882718
    [ウィキペディア] => 8.415335741836
    [enable] => 8.3906442444265
    [null] => 8.3209687294533
    [候補] => 8.2578720139834
    [語] => 8.2001040272595
    [みなさ] => 8.0805524658396
    [特別] => 7.9141734878465
    [Frequency] => 7.7122307469797
    [opensearch] => 7.6930177484499
    [会話] => 7.5272060006197
    [重要] => 7.3255414809827
    [nntp] => 7.1821921246839
    [編集] => 6.9077552789821
    [wiki] => 6.9077552789821
    [処理] => 6.8684744714807
    [gopher] => 6.8589651148127
    [カテゴリ] => 6.6943065408233
    [用] => 6.6943065408233
    [php] => 6.6943065408233
    [更新] => 6.6943065408233
    [x] => 6.6943065408233
    [if] => 6.6943065408233
    [設定] => 6.6943065408233
    [個人] => 6.6943065408233
    [portal] => 6.6943065408233
    [ウィキメディア・コモンズ] => 6.6608952010506
    [ツール] => 6.6099570478432
    [変種] => 6.5929445391421
    [項目] => 6.4730615202138
    [wmf] => 6.3683421983642
    [メディア] => 6.269988681775
    [離れる] => 6.2297217362322
    [メイン] => 6.2222862501306
    [行っ] => 6.2021855784236
    [井戸端] => 6.1610673314942
    [クリエイティブ・コモンズ] => 6.1422874368426
    [要約] => 6.1057536935101
    [telnet] => 6.0469772570382
    [template] => 5.9954685524493
    [Template] => 5.9934655477751
    [果たす] => 5.92567680657
    [タブ] => 5.6365912251159
    [語句] => 5.6296030764819
    [namespace] => 5.6034847533408
    [下がり] => 5.5016582905661
    [Inverse] => 5.4103668703567
    [svn] => 5.3506638469202
    [出典] => 5.3144467484779
    [一種] => 5.2883670356949
    [潜在] => 5.2785147392519
    [継承] => 5.2687585643065
    [寄付] => 5.212139670307
    [mailto] => 5.1939573512238
    [バグ] => 5.1585554241729
    [抽出] => 5.1243640594246
    [コミュニティ・ポータル] => 5.0436751481745
    [指標] => 4.954727662158
    [namespaces] => 4.9476604949349
    [mw] => 4.8730496311437
    [usability] => 4.8472417471878
    [自然] => 4.8158912173037
    [フィルタ] => 4.733003557498
    [window] => 4.6764245412744
    [役割] => 4.6407973636312
    [ライセンス] => 4.5516294190601
    [事典] => 4.5421953868267
    [ナビゲーション] => 4.528209144852
    [二つ] => 4.528209144852
    [おまかせ] => 4.4785375350547
    [文章] => 4.4628710272155
    [中] => 4.4628710272155
    [画像] => 4.4628710272155
    [的] => 4.4628710272155
    [help] => 4.4628710272155
    [ログイン] => 4.4628710272155
    [場合] => 4.4628710272155
    [i] => 4.4628710272155
    [category] => 4.4628710272155
    [数] => 4.4628710272155
    [作成] => 4.4628710272155
    [最近] => 4.4628710272155
    [リンク] => 4.4628710272155
    [関連] => 4.4628710272155
    [search] => 4.4628710272155
    [image] => 4.4628710272155
    [一般] => 4.4628710272155
    [他] => 4.4628710272155
    [度] => 4.4628710272155
    [Category] => 4.4628710272155
    [Portal] => 4.4628710272155
    [Help] => 4.4628710272155
    [l] => 4.4628710272155
    [https] => 4.4567501808698
    [働き] => 4.4270240006046
    [警告] => 4.3583101080566
    [irc] => 4.3389671452134
    [上げる] => 4.2097554137334
    [nbsp] => 4.1135273816389
    [分野] => 4.0427013290703
    [wikimedia] => 4.0090847182713
    [ロード] => 3.9685933569165
    [skins] => 3.9606881774094
    [テキスト] => 3.9580669439296
    [bits] => 3.9528449999484
    [出来事] => 3.9450628595063
    [アカウント] => 3.9347799925508
    [適用] => 3.8397023438485
    [計算] => 3.7550192566185
    [注] => 3.7550192566185
    [バージョン] => 3.6989297899674
    [ボックス] => 3.6749765489449
    [ftp] => 3.6690768268178
    [UTC] => 3.6651629274966
    [ポータル] => 3.6287255312942
    [containing] => 3.6193533914653
    [解析] => 3.545645286028
    [var] => 3.4721564706026
    [練習] => 3.4705474597969
    [参照] => 3.4561816899949
    [百科] => 3.4373428282985
    [特徴] => 3.4326880487535
    [固定] => 3.3726099248102
    [全文] => 3.2794223247362
    [移動] => 3.1349943408875
    [含む] => 3.1315519966965
    [日時] => 3.1089009716596
    [空間] => 3.0977649991191
    [印刷] => 3.0565444129507
    [閲覧] => 3.0554822779598
    [Polski] => 2.9977342762247
    [免責] => 2.9469421093846
    [warning] => 2.9469421093846
    [履歴] => 2.9095545773129
    [Русский] => 2.9004220937497
    [しまう] => 2.8735146408297
    [主] => 2.8647040111476
    [文] => 2.8387285247443
    [引用] => 2.8051119139453
    [ni] => 2.7887181041697
    [最終] => 2.7725887222398
    [意味] => 2.7567153730835
    [suggest] => 2.7488721956225
    [多く] => 2.7333680090865
    [フリー] => 2.7333680090865
    [報告] => 2.7181005369557
    [操作] => 2.6521425691639
    [アップ] => 2.5319982573219
    [api] => 2.5010360317179
    [document] => 2.471003744619
    [Document] => 2.471003744619
    [特定] => 2.4418471603276
    [事項] => 2.41911890925
    [状況] => 2.3859667019331
    [条件] => 2.3538783873816
    [変更] => 2.3126354288475
    [Term] => 2.2778924804037
    [skin] => 2.2633643798408
    [Deutsch] => 2.2314355136078
    [版] => 2.2314355136078
    [English] => 2.2314355136078
    [Français] => 2.2314355136078
    [年] => 2.2314355136078
    [すべて] => 2.2314355136078
    [一覧] => 2.2314355136078
    [o] => 2.2314355136078
    [g] => 2.2314355136078
    [案内] => 2.2314355136078
    [action] => 2.2314355136078
    [お知らせ] => 2.2314355136078
    [問い合わせ] => 2.2314355136078
    [月] => 2.2314355136078
    [日] => 2.2314355136078
    [more] => 2.2314355136078
    [規約] => 2.2314355136078
    [ください] => 2.2314355136078
    [プライバシー] => 2.2314355136078
    [ポリシー] => 2.2314355136078
    [詳細] => 2.2314355136078
    [m] => 2.2314355136078
    [あなた] => 2.2314355136078
    [木] => 2.2314355136078
    [下] => 2.2314355136078
    [可能] => 2.2314355136078
    [追加] => 2.2314355136078
    [index] => 2.2314355136078
    [view] => 2.2314355136078
    [D] => 2.2314355136078
    [機能] => 2.2314355136078
    [C] => 2.2314355136078
    [news] => 2.2314355136078
    [talk] => 2.2314355136078
    [名前] => 2.2314355136078
    [情報] => 2.2314355136078
    [":] => -INF
    [},] => -INF
    [':'] => -INF
    [apsibleNavForceNewVersion] => -INF
    ["-] => -INF
    [\\:|] => -INF
    [,] => -INF
    [=".] => -INF
    [="",] => -INF
    [:] => -INF
    [;:] => -INF
    [],] => -INF
    [={"-] => -INF
    [);] => -INF
    [gCollapsibleNavBucketTest] => -INF
    ["",] => -INF
    [[] => -INF
    [}\] => -INF
    ["] => -INF
    [)] => -INF
    ["],] => -INF
    [://] => -INF
    [|] => -INF
    [=] => -INF
    [;] => -INF
    [?] => -INF
    [\] => -INF
    ['')] => -INF
    [={"] => -INF
    [!=] => -INF
    [="";] => -INF
    [\\:\\/\\/",] => -INF
    []] => -INF
    [(] => -INF
    [",] => -INF
    ["":] => -INF
    [/] => -INF
    ['});] => -INF
    [="] => -INF
    [&] => -INF
    [=["] => -INF
    [}},] => -INF
    [','] => -INF
    ["},] => -INF
    [&#] => -INF
    [gCanonicalSpecialPageName] => -INF
    [{"] => -INF
    [""],] => -INF
    [={] => -INF
    [();] => -INF
    [=[],] => -INF
    [=["",] => -INF
    [.] => -INF
    [=[] => -INF
    [({'] => -INF
    [/$] => -INF
    [\\:\\/\\/|] => -INF
    [={},] => -INF
    [="/] => -INF
    [_] => -INF
    [-] => -INF
)
--
time:5 sec
  • -INFの原因はdf値が0(つまり、Yahooでの検索結果が0件、または取得できなかった場合)なので、スルー
  • ノート、トーク、その他もろもろの値が高いのは、Wikipediaの仕様*1


Webページによってこういう特徴あるだろうから、上限の閾値も設けないといけない。
その辺の判定はシビアかもなー


tfとTfを別物として判定してるので、それぞれの値が低くなってる。
大文字小文字区別しなければここら辺は改善できそう。

curl_multiで並列処理

結果はぼちぼちとして、今回の収穫はcurl_multi

こんだけAPI叩いてるのに、取得に5secしかかかってない。
すごい。

実際は閾値を設ける予定なので、もっと減らせるはず。
せめて3秒以内にしないと使い物にならないなー

APIとか

院試終わったら、サーバ立てる予定なのでそこで動くようにしようかと。
APIとか公開できればいいなー
サーバの余裕あればだけど。

*1:[http://gyazo.com/d7c761505af887e9cb8c7834f32db588.png:image]