Initial commit

Functional, without SSO
This commit is contained in:
Jimmy Monin
2016-09-18 11:03:26 +02:00
commit 57708e3169
253 changed files with 30787 additions and 0 deletions

View File

@ -0,0 +1,19 @@
<?php
namespace AdService\Parser;
use AdService\Filter;
/**
 * Base class for the HTML listing parsers.
 *
 * It extends \DOMDocument so that a concrete parser can call loadHTML() on
 * itself and then walk the resulting tree with the regular DOM API.
 */
abstract class AbstractParser extends \DOMDocument
{
    /**
     * Creates the underlying DOM document and makes libxml collect parse
     * errors internally instead of emitting them as PHP warnings.
     *
     * Note: libxml_use_internal_errors() is not scoped to this instance.
     */
    public function __construct()
    {
        // The overridden constructor must still run: it is what creates the
        // underlying document. Without it, any DOM call made before
        // loadHTML() (createElement(), property access, ...) fails.
        parent::__construct();

        libxml_use_internal_errors(true);
    }

    /**
     * Parses a page and returns the ads found in it.
     *
     * @param string $content raw HTML of the page to parse
     * @param Filter $filter  optional filter used to discard ads
     */
    abstract public function process($content, Filter $filter = null);
}

View File

@ -0,0 +1,177 @@
<?php
namespace AdService\Parser;
use AdService\Filter;
use AdService\Ad;
/**
 * Parser for "Lbc" result pages.
 *
 * Every <a> element whose class contains "list_item" is treated as one ad;
 * its id, links, thumbnail, category, location, date and price are read from
 * the anchor and its descendants.
 */
class Lbc extends AbstractParser
{
    /**
     * Month labels as they appear in listing dates, mapped to month numbers.
     *
     * @var array
     */
    protected static $months = array(
        "jan" => 1, "fév" => 2, "mars" => 3, "avr" => 4,
        "mai" => 5, "juin" => 6, "juillet" => 7, "août" => 8,
        "sept" => 9, "oct" => 10, "nov" => 11,
        "déc" => 12
    );

    /**
     * Scheme prepended to protocol-relative links (see formatLink()).
     *
     * @var string
     */
    protected $scheme;

    /**
     * Extracts the ads contained in a result page.
     *
     * @param string $content raw HTML of the result page
     * @param Filter $filter  optional filter (excluded ids, minimum id, validity check)
     * @param string $scheme  scheme used to complete links starting with "//"
     *
     * @return Ad[]|null ads indexed by their id; null when $content is empty
     */
    public function process($content, Filter $filter = null, $scheme = "http")
    {
        if (!$content) {
            return;
        }
        $this->scheme = $scheme;
        $this->loadHTML($content);

        // End of the current day: any parsed date beyond it is assumed to
        // belong to the previous year (listing dates carry no year).
        $timeToday = strtotime(date("Y-m-d")." 23:59:59");
        $ads = array();

        if ($filter) {
            $exclude_ids = $filter->getExcludeIds();

            /**
             * For backward compatibility $exclude_ids may be a single
             * numeric id instead of an array; anything else is ignored.
             */
            if (!is_numeric($exclude_ids) && !is_array($exclude_ids)) {
                unset($exclude_ids);
            }
        }

        $adNodes = $this->getElementsByTagName("a");
        foreach ($adNodes AS $result) {
            // is this anchor really an ad?
            if (false === strpos($result->getAttribute("class"), "list_item")) {
                continue;
            }

            $ad = new Ad();
            $ad->setProfessional(false)->setUrgent(false);

            // no id, no ad
            if (!preg_match('/([0-9]+)\.htm.*/', $result->getAttribute("href"), $idMatch)) {
                continue;
            }
            $id = $idMatch[1];

            // drop ads that were already sent
            if (isset($exclude_ids)) {
                if (is_numeric($exclude_ids)) {
                    /**
                     * Legacy detection: $exclude_ids holds the id of the
                     * last known ad, so parsing stops as soon as that ad
                     * is reached.
                     */
                    if ($id == $exclude_ids) {
                        break;
                    }
                } elseif (in_array($id, $exclude_ids)) {
                    continue;
                }
            }

            // drop ads that were already sent (minimum id)
            if ($filter && $id <= $filter->getMinId()) {
                continue;
            }

            $ad->setLink($this->formatLink($result->getAttribute("href")))
                ->setId($id)
                ->setTitle($result->getAttribute("title"))
                ->setLinkMobile(str_replace(
                    array("http://www.", "https://www."),
                    array("http://mobile.", "https://mobile."),
                    $ad->getLink()
                ));

            // thumbnail: carried by a <span data-imgsrc="...">
            foreach ($result->getElementsByTagName("span") AS $node) {
                if ($src = $node->getAttribute("data-imgsrc")) {
                    $ad->setThumbnailLink($this->formatLink($src));
                }
            }

            // The "item_supp" paragraphs are read by position:
            // 0 = category, 1 = location, 2 = date (+ urgent marker).
            $i = 0;
            foreach ($result->getElementsByTagName("p") AS $node) {
                $class = (string) $node->getAttribute("class");
                if (false !== strpos($class, "item_supp")) {
                    $value = trim($node->nodeValue);
                    if ($i == 0) { // category
                        if (false !== strpos($value, "(pro)")) {
                            $ad->setProfessional(true);
                        }
                        $ad->setCategory(trim(str_replace("(pro)", "", $value)));
                    } elseif ($i == 1) { // location
                        if (false !== strpos($value, "/")) {
                            $value = explode("/", $value);
                            $ad->setCountry(trim($value[1]))
                                ->setCity(trim($value[0]));
                        } else {
                            $ad->setCountry(trim($value));
                        }
                    } elseif ($i == 2) { // ad date + urgent marker
                        $spans = $node->getElementsByTagName("span");
                        if ($spans->length > 0) {
                            // A nested <span> marks the ad as urgent; it is
                            // removed so that only the date text remains.
                            $ad->setUrgent(true);
                            $node->removeChild($spans->item(0));
                            $value = trim($node->nodeValue);
                        }

                        $dateStr = preg_replace("#\s+#", " ", $value);
                        $aDate = explode(' ', $dateStr);
                        // A single-token date has no second element: fall
                        // back to "" instead of reading an undefined offset.
                        $aDate[1] = isset($aDate[1]) ? trim($aDate[1], ",") : "";

                        if (false !== strpos($dateStr, 'Aujourd')) {
                            $time = strtotime(date("Y-m-d")." 00:00:00");
                        } elseif (false !== strpos($dateStr, 'Hier')) {
                            $time = strtotime(date("Y-m-d")." 00:00:00");
                            $time = strtotime("-1 day", $time);
                        } else {
                            if (!isset(self::$months[$aDate[1]])) {
                                // Unknown month: only this paragraph is
                                // skipped, the ad is kept without a date.
                                continue;
                            }
                            $time = strtotime(date("Y")."-".self::$months[$aDate[1]]."-".$aDate[0]);
                        }

                        // The last token is expected to be "HH:MM".
                        $aTime = explode(":", $aDate[count($aDate) - 1]);
                        $time += (int) $aTime[0] * 3600;
                        if (isset($aTime[1])) {
                            $time += (int) $aTime[1] * 60;
                        }

                        if ($timeToday < $time) {
                            $time = strtotime("-1 year", $time);
                        }
                        $ad->setDate($time);
                    }
                    $i++;
                }
            }

            // price
            foreach ($result->getElementsByTagName("h3") AS $node) {
                $class = (string) $node->getAttribute("class");
                if (false !== strpos($class, "item_price")) {
                    // The match must start on a digit: otherwise a run of
                    // spaces located before the amount would be captured
                    // and the price read as 0. Digit groups may be
                    // separated by whitespace or a UTF-8 no-break space.
                    if (preg_match('#[0-9](?:[0-9\s]|\xC2\xA0)*#', $node->nodeValue, $priceMatch)) {
                        $ad->setPrice((int) preg_replace('#[^0-9]#', '', $priceMatch[0]));
                    }
                }
            }

            // discard ads that do not satisfy the filter
            if ($filter && !$filter->isValid($ad)) {
                continue;
            }

            $ads[$ad->getId()] = $ad;
        }

        return $ads;
    }

    /**
     * Completes a protocol-relative link ("//host/path") with the scheme
     * given to process(); any other link is returned unchanged.
     *
     * @param string $link
     *
     * @return string
     */
    protected function formatLink($link)
    {
        if (0 === strpos($link, "//")) {
            $link = $this->scheme.":".$link;
        }

        return $link;
    }
}

View File

@ -0,0 +1,137 @@
<?php
namespace AdService\Parser;
use AdService\Filter;
use AdService\Ad;
/**
 * Parser for "Olx" result pages.
 *
 * Ads are read from the <td> cells whose class contains "offer", inside the
 * table whose id contains "offers_table".
 */
class Olx extends AbstractParser
{
    /**
     * Month labels mapped to month numbers.
     *
     * NOTE(review): the keys are French except "апр.", while the page
     * keywords matched below are Cyrillic. Dates using any other Cyrillic
     * month label are therefore skipped; the table presumably needs to be
     * completed. Confirm the exact labels against real pages.
     *
     * @var array
     */
    protected static $months = array(
        "jan" => 1, "fév" => 2, "mars" => 3, "апр." => 4,
        "mai" => 5, "juin" => 6, "juillet" => 7, "août" => 8,
        "sept" => 9, "oct" => 10, "nov" => 11,
        "déc" => 12
    );

    /**
     * Extracts the ads contained in a result page.
     *
     * @param string $content raw HTML of the result page
     * @param Filter $filter  optional filter applied to each ad
     *
     * @return Ad[]|null ads indexed by their id; an empty array when the
     *                   offers table is missing; null when $content is empty
     */
    public function process($content, Filter $filter = null)
    {
        if (!$content) {
            return;
        }

        // Line breaks would otherwise glue adjacent words together in nodeValue.
        $content = str_replace("<br/>", " ", $content);
        $this->loadHTML($content);

        // End of the current day: a parsed date beyond it is assumed to
        // belong to the previous year.
        $timeToday = strtotime(date("Y-m-d")." 23:59:59");
        $ads = array();

        $tableOffers = null;
        foreach ($this->getElementsByTagName("table") AS $table) {
            if (false !== strpos($table->getAttribute("id"), "offers_table")) {
                $tableOffers = $table;
                break;
            }
        }
        if (!$tableOffers) {
            return array();
        }

        $adNodes = $tableOffers->getElementsByTagName("td");
        foreach ($adNodes AS $adNode) {
            if (false === strpos($adNode->getAttribute("class"), "offer")) {
                continue;
            }

            $ad = new Ad();
            $ad->setUrgent(false);
            // the markup gives no hint about professional sellers
            $ad->setProfessional(false);

            // @todo filtering on a minimum id is not possible for now: ids
            // do not seem to be numeric and incremental.

            // The second row holds the location and the date; without it
            // the cell cannot be parsed.
            $rows = $adNode->getElementsByTagName("tr");
            if ($rows->length < 2) {
                continue;
            }
            $columns = $adNode->getElementsByTagName("td");
            $row2_p = $rows->item(1)->getElementsByTagName("p");

            // date
            $dateNode = $row2_p->item(1);
            $dateStr = $dateNode
                ? preg_replace("#\s+#", " ", trim($dateNode->nodeValue))
                : "";
            if (!$dateStr) {
                continue;
            }
            $aDate = explode(' ', $dateStr);
            if (false !== strpos($dateStr, 'Сегодня')) { // today
                $time = strtotime(date("Y-m-d")." 00:00:00");
            } elseif (false !== strpos($dateStr, 'Вчера')) { // yesterday
                $time = strtotime(date("Y-m-d")." 00:00:00");
                $time = strtotime("-1 day", $time);
            } else {
                // expected shape: "<day> <month label> ..."
                if (!isset($aDate[1], self::$months[$aDate[1]])) {
                    continue;
                }
                $time = strtotime(date("Y")."-".self::$months[$aDate[1]]."-".$aDate[0]);
            }

            // time of day: the two characters on each side of the first ":"
            $pos = mb_strpos($dateStr, ":");
            if (false !== $pos) {
                // max() keeps the start offset inside the string when the
                // colon is one of the first two characters.
                $time += (int) mb_substr($dateStr, max(0, $pos - 2), 2) * 3600;
                $time += (int) mb_substr($dateStr, $pos + 1, 2) * 60;
                if ($timeToday < $time) {
                    $time = strtotime("-1 year", $time);
                }
            }
            $ad->setDate($time);

            // image (first column); the thumbnail size in the URL is
            // swapped for a larger one
            $imageColumn = $columns->item(0);
            if ($imageColumn) {
                $img = $imageColumn->getElementsByTagName("img");
                if ($img->length) {
                    $ad->setThumbnailLink(str_replace("94x72", "644x461", $img->item(0)->getAttribute("src")));
                }
            }

            // title + link
            $heading = $adNode->getElementsByTagName("h3")->item(0);
            $link = $heading ? $heading->getElementsByTagName("a")->item(0) : null;
            if ($link) {
                $ad->setTitle(trim($link->nodeValue));
                $ad->setLink($link->getAttribute("href"));
            }

            // urgent
            if (false !== strpos($adNode->nodeValue, "Срочно")) {
                $ad->setUrgent(true);
            }

            // location
            $ad->setCity(trim($row2_p->item(0)->nodeValue));

            // category (first paragraph of the second column)
            $categoryColumn = $columns->item(1);
            $categoryNode = $categoryColumn
                ? $categoryColumn->getElementsByTagName("p")->item(0)
                : null;
            $ad->setCategory($categoryNode ? trim($categoryNode->nodeValue) : "");

            // The id is the "ID....html" segment of the link, converted
            // from base 32 to base 10. No usable link means no ad.
            if (!preg_match("#ID([^.]+)\.html#", $ad->getLink(), $m)) {
                continue;
            }
            $ad->setId(base_convert($m[1], 32, 10));

            // price + currency (third column)
            $priceCell = $columns->item(2);
            $priceColumn = $priceCell ? trim($priceCell->nodeValue) : "";
            if (preg_match('#(?<price>[0-9\s]+)\s+(?<currency>грн|\$|€)#imsU', $priceColumn, $m)) {
                // "price" may contain any whitespace between digit groups:
                // strip everything that is not a digit before casting.
                $ad->setPrice((int) preg_replace('#[^0-9]#', '', $m["price"]))
                    ->setCurrency($m["currency"]);
            }

            if ($filter && !$filter->isValid($ad)) {
                continue;
            }

            $ads[$ad->getId()] = $ad;
        }

        return $ads;
    }
}

View File

@ -0,0 +1,125 @@
<?php
namespace AdService\Parser;
use AdService\Filter;
use AdService\Ad;
/**
 * Parser for "Seloger" result pages.
 *
 * Ads are the <article> elements carrying a "data-listing-id" attribute
 * inside the <section> whose class contains "liste_resultat".
 */
class Seloger extends AbstractParser
{
    /**
     * Extracts the ads contained in a result page.
     *
     * @param string $content raw HTML of the result page
     * @param Filter $filter  optional filter (excluded ids, validity check)
     *
     * @return Ad[]|null ads indexed by their id; an empty array when the
     *                   result section is missing; null when $content is empty
     */
    public function process($content, Filter $filter = null)
    {
        if (!$content) {
            return;
        }

        // The content is converted from UTF-8 to ISO-8859-1 before parsing.
        // The original author noted that the reason was unknown; presumably
        // loadHTML() reads the markup as ISO-8859-1 when no charset is
        // detected - TODO confirm.
        // utf8_decode() is deprecated as of PHP 8.2, so mbstring is used
        // when available.
        if (function_exists('mb_convert_encoding')) {
            $content = mb_convert_encoding($content, 'ISO-8859-1', 'UTF-8');
        } else {
            $content = utf8_decode($content);
        }
        $this->loadHTML($content);

        $ads = array();

        $section_results = null;
        foreach ($this->getElementsByTagName("section") AS $section) {
            if (false !== strpos($section->getAttribute("class"), "liste_resultat")) {
                $section_results = $section;
                break;
            }
        }
        if (!$section_results) {
            return array();
        }

        if ($filter) {
            $exclude_ids = $filter->getExcludeIds();

            /**
             * For backward compatibility $exclude_ids may be a single
             * numeric id instead of an array; anything else is ignored.
             */
            if (!is_numeric($exclude_ids) && !is_array($exclude_ids)) {
                unset($exclude_ids);
            }
        }

        $adNodes = $section_results->getElementsByTagName("article");
        foreach ($adNodes AS $adNode) {
            if (!$id = (int) $adNode->getAttribute("data-listing-id")) {
                continue;
            }

            // drop ads that were already sent
            if (isset($exclude_ids)) {
                if (is_numeric($exclude_ids)) {
                    /**
                     * Legacy detection: $exclude_ids holds the id of the
                     * last known ad, so parsing stops as soon as that ad
                     * is reached.
                     */
                    if ($id == $exclude_ids) {
                        break;
                    }
                } elseif (in_array($id, $exclude_ids)) {
                    continue;
                }
            }

            $ad = new Ad();
            $ad->setUrgent(false)
                ->setId($id);
            // the markup gives no hint about professional sellers
            $ad->setProfessional(false);

            // image: first <img> whose class contains "listing_photo"; the
            // size token in its URL is swapped for "b600"
            foreach ($adNode->getElementsByTagName("img") AS $img) {
                if (false !== strpos($img->getAttribute("class"), "listing_photo")) {
                    $ad->setThumbnailLink(
                        str_replace(
                            array("c175", "c250"),
                            "b600",
                            $img->getAttribute("src")
                        )
                    );
                    break;
                }
            }

            // title + link + location: first <a> of the first <h2>.
            // An article without <h2> simply gets none of them.
            $heading = $adNode->getElementsByTagName("h2")->item(0);
            $titleLink = $heading ? $heading->getElementsByTagName("a")->item(0) : null;
            if ($titleLink) {
                $city = $titleLink->getElementsByTagName("span")->item(0);
                if ($city) {
                    // location
                    $ad->setCity(trim($city->nodeValue));
                }
                $ad->setTitle(trim($titleLink->nodeValue));
                $ad->setLink($titleLink->getAttribute("href"));
            }

            // price: any <a> whose class contains "amount"; every non-digit
            // character is removed before the cast
            foreach ($adNode->getElementsByTagName("a") AS $anchor) {
                $classCSS = $anchor->getAttribute("class");
                if (false !== strpos($classCSS, "amount")) {
                    $ad->setPrice((int) preg_replace("#[^0-9]*#", "", $anchor->nodeValue));
                }
            }

            if ($filter && !$filter->isValid($ad)) {
                continue;
            }

            $ads[$ad->getId()] = $ad;
        }

        return $ads;
    }
}