Web Mining Übung 5

This commit is contained in:
Victor-Philipp Negoescu 2013-06-28 17:43:42 +02:00
parent 80ca6a411c
commit 7292d5e935
340 changed files with 27884 additions and 0 deletions

View File

@ -0,0 +1,26 @@
<?php
include 'functions.inc.php';
# Converts every crawled HTML page in articles_html/ into a plaintext file of
# the same name in articles_text/. Each output file holds the page's word list
# (as produced by htmlToWordList() from functions.inc.php) joined by single spaces.
$HTML_DIR = 'articles_html/';

# clearDir() comes from functions.inc.php; presumably it empties the output
# directory so results of an earlier run do not linger — confirm there.
clearDir('articles_text/');

$handle = opendir($HTML_DIR);
if ($handle === false) {
    # Without this guard readdir() would receive `false` instead of a handle.
    echo "Cannot open directory $HTML_DIR\n";
    exit(1);
}

while (false !== ($file = readdir($handle))) {
    # Skip the directory self/parent entries and any nested directories.
    if ($file == '.' || $file == '..' || is_dir($HTML_DIR . $file))
        continue;

    echo "Converting $file from HTML to plaintext...\n";
    $html = file_get_contents($HTML_DIR . $file);
    if ($html === false) {
        # An unreadable file must not be converted into an empty article.
        echo "Could not read $file, skipping.\n";
        continue;
    }
    $words = htmlToWordList($html);

    # Debug aid (disabled): dump the word list of a single article and stop.
    #if ($file == 'Artificial_intelligence') {
    #    $words = htmlToWordList($html, true);
    #    print_r($words);
    #    exit;
    #}

    file_put_contents("articles_text/".$file, implode(' ', $words));
}

# Release the directory handle once all entries have been processed.
closedir($handle);
?>

View File

@ -0,0 +1,94 @@
<?php
include 'functions.inc.php';
/**
 * Normalises an article name into a plain identifier made of word characters
 * and underscores, used both as a file name and as a DOT node name.
 *
 * The name is URL-decoded first; afterwards every run of underscores and
 * non-word characters (punctuation, whitespace, '+', ...) is collapsed into a
 * single underscore. Leading/trailing separators are not trimmed, so
 * "Foo_(bar)" becomes "Foo_bar_".
 *
 * The previous pattern '/[^\w+]/' had its quantifier inside the character
 * class, which made '+' an allowed character: "C%2B%2B" came out as "C++",
 * which is not a valid unquoted DOT identifier.
 *
 * @param string $str Article name, possibly percent-encoded.
 * @return string Normalised identifier.
 */
function fixChars($str) {
    $decoded = urldecode($str);
    # One pass replaces the former "_ -> space, non-word -> space,
    # collapse spaces, space -> _" chain with the same result.
    return preg_replace('/[\W_]+/', '_', $decoded);
}
require_once 'iwa/include_me.inc.php';
# Breadth-first crawl of Wikipedia starting at a seed page. Every fetched page
# is stored in articles_html/ and the links between crawled articles are
# written to wikigraph.dot as a directed graph.
$iwa = new IntelligentWebAccess();
clearDir('articles_html/');

$HOST = "en.wikipedia.org";
$BASE_URL = "http://$HOST/wiki/";

$articlesToCrawl = array('Category:Data_mining'); # queue of articles still to be crawled
#$articlesToCrawl = array('Statistics'); # alternative seed
$crawledArticles = array(); # articles that were fetched successfully
$linkedArticles = array();  # article => list of articles it links to

# Crawl until enough articles have been fetched OR the queue runs dry.
while (count($crawledArticles) < 100 && count($articlesToCrawl) > 0) {
    $article = array_shift($articlesToCrawl);
    echo "Current article: ".$article."\n";

    # Names taken from hrefs are already percent-encoded. Decode before
    # encoding so that "%2B" is not turned into "%252B" (double encoding).
    $url = $BASE_URL . rawurlencode(rawurldecode($article));
    $content = $iwa->get($url, array(), array(CURLOPT_FOLLOWLOCATION => true));
    $content = preg_replace("/[\r\n]/", " ", $content);

    preg_match_all('/href="(.+?)"/', $content, $hrefMatches);
    if (count($hrefMatches[1]) <= 0)
        continue; # nothing usable was returned for this article

    file_put_contents("articles_html/".fixChars($article), $content);

    # Find all links to other articles. Titles containing ':' (namespaces such
    # as Category: or File:) are ignored; a trailing "#section" anchor is
    # dropped so that one article is not queued under several names.
    $nextArticles = array();
    foreach ($hrefMatches[1] as $href) {
        if (!preg_match("/^\/wiki\/([^:#]+)(?:#.*)?$/", $href, $matches))
            continue;

        if ($matches[1] !== 'Main_Page') # only crawl non-Main_Page articles
            $nextArticles[] = $matches[1];

        # Remember the link from the current article to this one.
        $linkedArticles[$article][] = $matches[1];
    }

    # The current article is done; queue its links unless already crawled.
    $crawledArticles[] = $article;
    $articlesToCrawl = array_diff(array_unique(array_merge($articlesToCrawl, $nextArticles)), $crawledArticles);
    #print_r($articlesToCrawl);
    #sleep(1);
}

# Finally, drop all links that point to uncrawled articles and save the
# graph as a DOT formatted file.
$dotGraph = "digraph {\n";
foreach ($linkedArticles as $fromArticle => $toArticles) {
    if (strpos($fromArticle, ':') !== false)
        continue; # skip special pages (the entry point)

    $fromArticle = fixChars($fromArticle);

    # Keep crawled targets only, then normalise the names BEFORE removing
    # duplicates, because different raw names can normalise to the same node.
    $targets = array_unique(array_map('fixChars', array_intersect($toArticles, $crawledArticles)));

    foreach ($targets as $toArticle) {
        if ($toArticle !== $fromArticle) # ignore self-referral links
            $dotGraph .= "\t".$fromArticle." -> ".$toArticle."\n";
    }
}
$dotGraph .= "}\n";

if (file_put_contents("wikigraph.dot", $dotGraph) === false)
    echo "Could not write wikigraph.dot\n";
?>

View File

@ -0,0 +1,11 @@
<?php
include 'functions.inc.php';
# Ad-hoc search: passes a fixed list of query terms to doQuery(), which is
# defined in functions.inc.php.
#
# Other term lists that were tried:
#   array('machine', 'learning')
#   array('frequent', 'itemsets')
#
# Debug aid (disabled): print the name of every article.
#   foreach ($articles as $article)
#       echo $article->name()."\n";
#
# NOTE(review): the meaning of the two `true` flags is determined by doQuery()
# and cannot be seen from this script — confirm in functions.inc.php.
$searchTerms = array('web', 'mining');
doQuery($searchTerms, true, true);
?>

View File

@ -0,0 +1,28 @@
<?php
include 'functions.inc.php';
# Evaluates both ranking methods: every article of the graph is searched for
# using the words of its own name, and the position at which the article shows
# up in the HITS and PageRank result lists is averaged over all articles.
$articles = loadDotGraph('wikigraph.dot');

$totalHITS = 0;   # sum of the positions found in the HITS ranking
$totalPR = 0;     # sum of the positions found in the PageRank ranking
$numArticles = 0; # number of articles present in both rankings

foreach ($articles as $article) {
    $name = $article->name();

    # The query consists of the lower-cased words of the article name.
    $query = explode('_', strtolower($name));
    $ranking = doQuery($query, false);
    #print_r($ranking);

    # array_search() yields the key under which the article is listed
    # (presumably its 0-based position — confirm against doQuery()).
    $hitsRank = array_search($name, $ranking['HITS']);
    $prRank = array_search($name, $ranking['PageRank']);

    # Only articles that appear in both rankings are counted.
    if ($hitsRank === false || $prRank === false)
        continue;

    $numArticles++;
    $totalHITS += $hitsRank;
    $totalPR += $prRank;
    echo "HITS: ".$hitsRank." PageRank: ".$prRank."\n";
}

# Guard against a division by zero when no article was found in both rankings.
if ($numArticles > 0)
    echo "Avg. Rankings - HITS: ".($totalHITS / $numArticles)." PageRank: ".($totalPR / $numArticles)."\n";
else
    echo "Avg. Rankings - n/a (no article was found in both rankings)\n";
?>

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Some files were not shown because too many files have changed in this diff Show More