Web Mining Übung 5
This commit is contained in:
parent
80ca6a411c
commit
7292d5e935
26
ss2013/1_Web Mining/Uebungen/5_Uebung/1_htmlToText.php
Normal file
26
ss2013/1_Web Mining/Uebungen/5_Uebung/1_htmlToText.php
Normal file
@ -0,0 +1,26 @@
|
||||
<?php
|
||||
include 'functions.inc.php';
|
||||
|
||||
# Convert every crawled HTML article into a plaintext file that contains the
# article's words separated by single spaces.
$HTML_DIR = 'articles_html/';
$TEXT_DIR = 'articles_text/';

# clearDir() comes from functions.inc.php.
# NOTE(review): presumably it empties the target directory - confirm there.
clearDir($TEXT_DIR);

$handle = opendir($HTML_DIR);
if ($handle === false) {
    # Without this guard readdir() would be called with an invalid handle.
    echo "Cannot open directory $HTML_DIR\n";
    exit(1);
}

while (false !== ($file = readdir($handle))) {
    # Skip the "." / ".." entries and anything that is not a regular file.
    if ($file == '.' || $file == '..' || ! is_file($HTML_DIR . $file))
        continue;

    echo "Converting $file from HTML to plaintext...\n";

    $html = file_get_contents($HTML_DIR . $file);
    if ($html === false) {
        echo "Could not read $file, skipping\n";
        continue;
    }

    # htmlToWordList() comes from functions.inc.php; its result is joined
    # with single spaces and stored under the same file name.
    $words = htmlToWordList($html);
    file_put_contents($TEXT_DIR . $file, implode(' ', $words));
}

closedir($handle);
|
||||
|
||||
?>
|
||||
94
ss2013/1_Web Mining/Uebungen/5_Uebung/1_wikiCrawler.php
Normal file
94
ss2013/1_Web Mining/Uebungen/5_Uebung/1_wikiCrawler.php
Normal file
@ -0,0 +1,94 @@
|
||||
<?php
|
||||
include 'functions.inc.php';
|
||||
|
||||
/**
 * Turn a (possibly URL-encoded) article name into an identifier that
 * consists of word characters only. The crawler uses the result as a file
 * name and as a node name in the DOT graph.
 *
 * Steps: URL-decode, treat underscores as spaces, replace every non-word
 * character by a space, collapse whitespace runs into one space and finally
 * turn the spaces back into underscores. Leading and trailing separators are
 * not trimmed, so "Foo_(bar)" becomes "Foo_bar_".
 *
 * @param string $str raw article name, e.g. as captured from a /wiki/ href
 * @return string the normalized name
 */
function fixChars($str) {
    $str = urldecode($str);
    $str = str_replace('_', ' ', $str);
    # Replace EVERY non-word character. The previous class [^\w+] kept literal
    # '+' characters (e.g. "C++"), which are not allowed in unquoted DOT IDs.
    $str = preg_replace('/\W/', ' ', $str);
    $str = preg_replace('/\s+/', ' ', $str);
    $str = str_replace(' ', '_', $str);
    return $str;
}
|
||||
|
||||
require_once 'iwa/include_me.inc.php';
|
||||
# HTTP client used for all page fetches (class provided by the iwa library).
$iwa = new IntelligentWebAccess();

# Prepare the output directory for the downloaded articles.
clearDir('articles_html/');

# Every article URL is built as <BASE_URL><article name>.
$HOST = 'en.wikipedia.org';
$BASE_URL = 'http://' . $HOST . '/wiki/';

# Crawl state:
#   $articlesToCrawl - queue of article names still to be fetched (seeded
#                      with the entry point)
#   $crawledArticles - names of the articles fetched so far
#   $linkedArticles  - map: article name => list of article names it links to
$articlesToCrawl = array();
$articlesToCrawl[] = 'Category:Data_mining';
$crawledArticles = array();
$linkedArticles = array();
|
||||
|
||||
# Breadth-first crawl: keep fetching queued articles until enough articles
# have been stored OR no articles are left in the queue.
$MAX_ARTICLES = 100; # upper bound on the number of stored articles

while (count($crawledArticles) < $MAX_ARTICLES && count($articlesToCrawl) > 0) {
    $article = array_shift($articlesToCrawl);

    echo "Current article: ".$article."\n";

    # Names captured from hrefs are already percent-encoded. Decode them first
    # so that e.g. "C%2B%2B" is not double-encoded into "C%252B%252B".
    $url = $BASE_URL . rawurlencode(rawurldecode($article));
    $content = $iwa->get($url, array(), array(CURLOPT_FOLLOWLOCATION => true));

    # Flatten the page to a single line; this is also the form written to disk.
    $content = preg_replace("/[\r\n]/", " ", $content);

    # A page without any href is treated as a failed download: it is neither
    # stored nor counted as crawled.
    preg_match_all('/href="(.+?)"/', $content, $hrefMatches);
    if (count($hrefMatches[1]) <= 0)
        continue;

    file_put_contents("articles_html/".fixChars($article), $content);

    # Find all links to other articles. Links containing ':' (namespaced
    # pages) are ignored, and a trailing "#fragment" is dropped so that
    # "Foo#History" and "Foo" denote the same article.
    $nextArticles = array();
    foreach ($hrefMatches[1] as $href) {
        if (! preg_match('~^/wiki/([^:#]+)(?:#.*)?$~', $href, $matches))
            continue;

        $target = $matches[1];

        if ($target !== 'Main_Page') # only crawl non-Main_Page articles
            $nextArticles[] = $target;

        # remember the link from the current article to the target
        if (! isset($linkedArticles[$article]))
            $linkedArticles[$article] = array();
        $linkedArticles[$article][] = $target;
    } # end foreach link

    # Add current article to crawled articles
    $crawledArticles[] = $article;

    # Append the newly found articles to the queue, without duplicates and
    # without articles that have already been crawled.
    $articlesToCrawl = array_diff(array_unique(array_merge($articlesToCrawl, $nextArticles)), $crawledArticles);
} # end while
|
||||
|
||||
# Finally, save the link graph as a DOT formatted file. Only links whose
# target has been crawled are kept; links starting at special pages (names
# containing ':', i.e. the entry point) and self-links are dropped.

# name => index lookup table so that "has this article been crawled?" is a
# single isset() instead of a linear in_array() scan per link.
$crawledLookup = array_flip($crawledArticles);

$dotGraph = "digraph {\n";
foreach ($linkedArticles as $fromArticle => $toArticles) {
    if (strpos((string) $fromArticle, ':') !== false)
        continue; # Skip special pages (entry point)

    $fromArticle = fixChars($fromArticle);

    # Normalized targets already written for this source. De-duplicating
    # AFTER fixChars() prevents duplicate edges when two raw spellings of a
    # link normalize to the same node name.
    $emitted = array();

    foreach ($toArticles as $toArticle) {
        if (! isset($crawledLookup[$toArticle]))
            continue; # target was never crawled

        $toArticle = fixChars($toArticle);
        if ($toArticle == $fromArticle || isset($emitted[$toArticle]))
            continue; # ignore self-referal links and repeated edges

        $emitted[$toArticle] = true;
        $dotGraph .= "\t".$fromArticle." -> ".$toArticle."\n";
    }
}
$dotGraph .= "}\n";

if (file_put_contents("wikigraph.dot", $dotGraph) === false)
    echo "Could not write wikigraph.dot\n";
|
||||
?>
|
||||
11
ss2013/1_Web Mining/Uebungen/5_Uebung/2_1-3.php
Normal file
11
ss2013/1_Web Mining/Uebungen/5_Uebung/2_1-3.php
Normal file
@ -0,0 +1,11 @@
|
||||
<?php
|
||||
include 'functions.inc.php';
|
||||
|
||||
# Issue one two-term query via doQuery() from functions.inc.php.
# Alternative term pairs that can be swapped in:
#   array('machine', 'learning')
#   array('frequent', 'itemsets')
$query = array();
$query[] = 'web';
$query[] = 'mining';

# NOTE(review): the meaning of the two boolean flags is defined by doQuery(),
# which is not visible in this file - confirm there.
doQuery($query, true, true);
|
||||
?>
|
||||
28
ss2013/1_Web Mining/Uebungen/5_Uebung/2_4.php
Normal file
28
ss2013/1_Web Mining/Uebungen/5_Uebung/2_4.php
Normal file
@ -0,0 +1,28 @@
|
||||
<?php
|
||||
include 'functions.inc.php';
|
||||
|
||||
# For every article of the link graph, use the words of the article's own
# name as a query and look up where that article appears in the HITS and in
# the PageRank ranking returned by doQuery(). Finally print the average of
# these positions over all articles that were found in both rankings.
$articles = loadDotGraph('wikigraph.dot');

$totalHITS = 0;
$totalPR = 0;
$numArticles = 0;

foreach ($articles as $article) {
    $name = $article->name();

    # Article names use '_' as word separator, so the lower-cased parts
    # form the query terms.
    $query = explode('_', strtolower($name));

    $ranking = doQuery($query, false);

    # array_search() yields the key under which the name is stored.
    # NOTE(review): presumably that key is the 0-based rank - confirm against doQuery().
    $hitsRank = array_search($name, $ranking['HITS']);
    $prRank = array_search($name, $ranking['PageRank']);

    # Only articles present in both rankings contribute to the averages.
    if ($hitsRank === false || $prRank === false)
        continue;

    $numArticles++;
    $totalHITS += $hitsRank;
    $totalPR += $prRank;

    echo "HITS: ".$hitsRank." PageRank: ".$prRank."\n";
}

# Guard against a division by zero when no article was found in both rankings.
if ($numArticles > 0)
    echo "Avg. Rankings - HITS: ".($totalHITS / $numArticles)." PageRank: ".($totalPR / $numArticles)."\n";
else
    echo "Avg. Rankings - no article was found in both rankings\n";
|
||||
|
||||
?>
|
||||
BIN
ss2013/1_Web Mining/Uebungen/5_Uebung/Solution.doc
Normal file
BIN
ss2013/1_Web Mining/Uebungen/5_Uebung/Solution.doc
Normal file
Binary file not shown.
BIN
ss2013/1_Web Mining/Uebungen/5_Uebung/Solution.docx
Normal file
BIN
ss2013/1_Web Mining/Uebungen/5_Uebung/Solution.docx
Normal file
Binary file not shown.
BIN
ss2013/1_Web Mining/Uebungen/5_Uebung/Solution.pdf
Normal file
BIN
ss2013/1_Web Mining/Uebungen/5_Uebung/Solution.pdf
Normal file
Binary file not shown.
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
1
ss2013/1_Web Mining/Uebungen/5_Uebung/articles_html/Data
Normal file
1
ss2013/1_Web Mining/Uebungen/5_Uebung/articles_html/Data
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
1
ss2013/1_Web Mining/Uebungen/5_Uebung/articles_html/FICO
Normal file
1
ss2013/1_Web Mining/Uebungen/5_Uebung/articles_html/FICO
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
1
ss2013/1_Web Mining/Uebungen/5_Uebung/articles_html/IEEE
Normal file
1
ss2013/1_Web Mining/Uebungen/5_Uebung/articles_html/IEEE
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user