Парсер referats.yandex

Дата января 15, 2009

Комментариев нет

Кому надо, тот знает зачем =)

А если его немного переписать, можно открыть блог, посвящённый А. С. Пушкину, а новые стихи брать всё там же, на yandex =)

<?php

/**
 * Fetches a batch of referats.yandex pages in parallel and stores what parses.
 *
 * A socket is opened and a GET request sent for every URL first; the
 * responses are then read round-robin from all open sockets. Each response
 * is searched for the "Тема: «…»" heading and the <p> paragraphs that
 * follow it, and every page that matches is inserted into the `data` table
 * (columns `title`, `textdata`).
 *
 * The function uses the default ext/mysql link, so the caller must have
 * connected and selected the database beforehand.
 * NOTE(review): ext/mysql was removed in PHP 7; caller and function have to
 * be migrated to mysqli/PDO together.
 *
 * @param array $urls Map of key => absolute URL (with host) to download.
 * @return void
 */
function process_urls($urls)
{
    // Upper bound on connection attempts per URL, so that an unreachable
    // server cannot keep the function spinning forever.
    $max_attempts = 5;

    $errno = 0;
    $errstr = '';
    $texts = array();
    $sockets = array();
    $eofs = array();
    $attempts = array();
    $counter = 0;

    // Phase 1: connect and send the requests, retrying failed connections.
    $pending = $urls;
    while (count($pending) != 0)
    {
        foreach (array_keys($pending) as $id)
        {
            $parts = parse_url($pending[$id]);
            if ($parts === false || !isset($parts['host']))
            {
                // Nothing sensible can be requested without a host name.
                unset($pending[$id]);
                continue;
            }

            // Throttle: pause for a second after every 100 connection attempts.
            if ($counter != 0 && $counter % 100 == 0)
                sleep(1);
            $counter++;
            $attempts[$id] = isset($attempts[$id]) ? $attempts[$id] + 1 : 1;

            // The server address is hard-coded; the Host header below carries
            // the host name taken from the URL.
            $socket = fsockopen('213.180.204.20', 80, $errno, $errstr, 5);
            if (!$socket)
            {
                if ($attempts[$id] >= $max_attempts)
                    unset($pending[$id]);
                continue;
            }

            $request = 'GET '.(isset($parts['path']) ? $parts['path'] : '/');
            if (isset($parts['query']))
                $request .= '?'.$parts['query'];
            $request .= " HTTP/1.1\r\n";
            $request .= 'Host: '.$parts['host']."\r\n";
            $request .= "Connection: Close\r\n\r\n";
            fwrite($socket, $request);

            $sockets[$id] = $socket;
            $eofs[$id] = false;
            $texts[$id] = '';
            unset($pending[$id]);
        }
    }

    // Phase 2: drain all sockets round-robin until each one reports EOF.
    while (in_array(false, $eofs, true))
    {
        foreach ($eofs as $id => $eof)
        {
            if ($eof !== false)
                continue;
            $chunk = fread($sockets[$id], 4096);
            if ($chunk === false)
            {
                // A read error ends this stream; keep whatever arrived so far.
                $eofs[$id] = true;
                continue;
            }
            $texts[$id] .= $chunk;
            $eofs[$id] = feof($sockets[$id]);
        }
    }

    foreach ($sockets as $socket)
        fclose($socket);

    // Phase 3: extract the title and paragraphs, then store them.
    foreach ($texts as $text)
    {
        // Line breaks are dropped so the patterns can match across them.
        $text = str_replace(array("\n", "\r"), '', $text);

        // NOTE(review): there is no `u` modifier, so matching is byte-wise and
        // this file must be saved in the same encoding as the fetched page.
        // If that encoding is UTF-8, [^»] excludes single bytes rather than
        // the character » - confirm against real responses.
        $found = preg_match(
            '#<h1 style="color\:black\; margin\-left\:0\;">Тема\: \«([^»]+)»</h1>((<p>[^<]+</p>)+)</div></td>#Sim',
            $text, $matches);
        if (!$found)
            continue; // not a referat page (error response, changed markup, ...)

        preg_match_all('#<p>([^<]+)</p>#Sim', $matches[2], $matches2);
        mysql_query(
            'INSERT DELAYED INTO data (title, textdata) VALUES ("'.mysql_real_escape_string($matches[1]).
            '", "'.mysql_real_escape_string(implode("\r\n", $matches2[1])).
            '")');
    }
}

// The crawl is long-running: allow up to an hour of execution time.
set_time_limit(3600);

// Connect first and stop right away if the database is unavailable;
// otherwise the whole crawl would run and silently lose every insert.
// NOTE(review): ext/mysql was removed in PHP 7; this script and
// process_urls() have to be migrated to mysqli/PDO together.
if (!mysql_connect('localhost', 'root', 'root') || !mysql_select_db('data'))
    die('Database connection failed: '.mysql_error());

// Start every run from an empty `data` table.
mysql_query('CREATE TABLE IF NOT EXISTS `data` (
`title` VARCHAR( 255 ) NOT NULL ,
`textdata` TEXT NOT NULL
);');
mysql_query('TRUNCATE TABLE `data`');

// Subject names used to build the `mix` query of all.xml.
$themes = array('astronomy', 'geology', 'gyroscope', 'literature', 'marketing', 'mathematics', 'music', 'polit',
    'agrobiologia', 'law', 'psychology', 'geography', 'physics', 'philosophy', 'chemistry', 'estetica');
$urls = array();

// Build the URL list; the loop runs 50001 times (0..50000 inclusive).
for ($i = 0; $i <= 50000; $i++)
{
    // Pick a random non-empty subset of themes without repetition: after each
    // pick the loop continues while themes remain and rand(0, 5) is non-zero.
    $themes_copy = $themes;
    $use_themes = array();
    do
    {
        $rand_index = array_rand($themes_copy);
        $use_themes[] = $themes_copy[$rand_index];
        unset($themes_copy[$rand_index]);
    } while (count($themes_copy) && rand(0, 5));

    // Themes go into `mix` as a %2c-separated list and again as `<theme>=on` flags.
    $urls[] = 'http://referats.yandex.ru/all.xml?mix='.implode('%2c', $use_themes).'&'.implode('=on&', $use_themes).'=on';
}

// Process the list in batches of ten, taken from the end of the array.
while (count($urls))
{
    process_urls(array_splice($urls, -10));
}

Автор: Сергей Сусиков
http://angerslave.org.ru

Добавить в закладки

Автор Diverse