anticor registry import

master
Константин 2023-09-02 21:17:35 +03:00
parent abd41c2477
commit 0a057d67c5
5 changed files with 96 additions and 11 deletions

View File

@ -118,7 +118,7 @@ class Page extends Model {
public static function byUrl($url) {
if ($url = trim($url, '/ ')) {
$query = self::query();
collect(explode('/', $url))->reverse()->values()->each(function($slug, $index) use ($query) {
collect(explode('/', $url))->reverse()->values()->each(function($slug, $index) use($query) {
if ($slug !== '') {
$index ? $query->nthParentSlug($index, $slug) : $query->bySlug($slug);
}

View File

@ -10,8 +10,11 @@ use Illuminate\Support\Facades\Storage;
class FileDownloadService {
protected array $documentMimes = [
'docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'doc' => 'application/msword',
'pdf' => 'application/pdf',
'xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
'xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'xls' => 'application/vnd.ms-excel',
'pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
];
protected array $imageMimes = [
'jpg' => 'image/jpg',

View File

@ -0,0 +1,85 @@
<?php
namespace App\Services\Registries;
use App\Models\Registries\Category;
use Illuminate\Support\Str;
use PHPHtmlParser\Dom;
/**
 * Imports the anti-corruption document registry by scraping faufcc.ru.
 *
 * Walks the category links on the start page, descends one level into
 * sub-category pages, and stores every document link found in the tables
 * of each (paginated) entries page as a registry entry.
 *
 * Relies on $this->dom, $this->registry and $this->download() provided by
 * RegistryImportService (presumably initialised by its constructor — confirm).
 */
class AnticorImportService extends RegistryImportService {
    /**
     * Debug helper: dumps the "next page" href of a fixed paginated listing.
     */
    public function test() {
        $url = "https://www.faufcc.ru/about-us/protivodeystvie-korruptsii/normativnye-pravovye-akty/vedomstvennye-normativnye/index.php?PAGEN_1=3";
        $dom = new Dom;
        $dom->loadFromUrl($url);
        // Reuse the shared pager lookup; it returns null when there is no
        // active pager item, no following item, or no link inside it.
        var_dump($this->getNextPage($dom));
    }

    /**
     * Entry point: creates one top-level category per link on the start page
     * and imports its sub-categories and its own entries.
     */
    public function import() {
        $nodes = $this->dom->find('.user-container ul li a')->toArray();
        foreach ($nodes as $node) {
            $category = $this->registry->addCategory(trim($node->text));
            $this->parseSubpage($node->href, $category);
            $this->parseEntriesPage($node->href, $category);
        }
    }

    /**
     * Creates a child category of $parent for every link listed on the page
     * at $url and imports the entries of each linked page into it.
     *
     * @param string   $url    Site-relative path of the category page.
     * @param Category $parent Category the new child categories are added to.
     */
    public function parseSubpage($url, Category $parent) {
        $dom = $this->loadSitePath($url);
        $nodes = $dom->find('.user-container ul li a')->toArray();
        foreach ($nodes as $node) {
            $category = $parent->addCategory(trim($node->text));
            $this->parseEntriesPage($node->href, $category);
        }
    }

    /**
     * Imports all document links from the tables on the page at $url and
     * then follows the pager to the next page, if any.
     *
     * An <h3> starts a new child category of $parent; tables that follow it
     * are imported into that category. Tables before the first <h3> go into
     * $parent itself. The heading context is reset on every page.
     *
     * @param string   $url    Site-relative path of the entries page.
     * @param Category $parent Category that receives entries and sub-headings.
     */
    public function parseEntriesPage($url, Category $parent) {
        $dom = $this->loadSitePath($url);
        $nodes = $dom->find('.user-container *')->toArray();
        $current = $parent;
        foreach ($nodes as $node) {
            $tagName = $node->tag->name();
            if ($tagName === 'h3') {
                $current = $parent->addCategory(trim($node->text));
            } elseif ($tagName === 'table') {
                $this->importEntries($current, $node->find('tr a')->toArray());
            }
        }
        if ($res = $this->getNextPage($dom)) $this->parseEntriesPage($res, $parent);
    }

    /**
     * Creates (or reuses) a registry entry per link and attaches the
     * downloaded file to it; when the download yields nothing, the original
     * href is stored as a plain link instead.
     *
     * @param Category $category Category the entries are filed under.
     * @param array    $items    Anchor nodes taken from a documents table.
     */
    public function importEntries(Category $category, $items) {
        foreach ($items as $item) {
            // NOTE(review): the length limit is applied before the &#40;/&#41;
            // replacements, and those entities are mapped to guillemets rather
            // than parentheses — presumably deliberate for this site; confirm.
            $name = Str::limit(Str::replace('&quot;', '"', trim($item->text)), 745);
            $name = Str::replace('&#40;', '«', $name);
            $name = Str::replace('&#41;', '»', $name);
            $entry = $this->registry->entries()->firstOrCreate(['name' => $name, 'category_id' => $category->id ?? 0]);
            $asset = $this->download($item->href, 'registries/anticor');
            // Single update instead of two consecutive ones.
            $attributes = ['asset_id' => $asset->id ?? null];
            if (!$asset) $attributes['link'] = $item->href;
            $entry->update($attributes);
        }
    }

    /**
     * Returns the href of the pager item that follows the active one.
     *
     * @param Dom $dom Parsed page containing '.paging__item' elements.
     * @return string|null Href of the next page, or null when there is no
     *                     active item, no following item, or no link in it.
     */
    public function getNextPage($dom) {
        $pages = $dom->find('.paging__item')->toArray();
        $key = null;
        foreach ($pages as $k => $page) {
            if ($page->class === 'paging__item active') $key = $k;
        }
        if ($key === null) return null;
        $next = $pages[$key + 1] ?? null;
        if (!$next) return null;
        // A pager item without an anchor must not be dereferenced.
        $anchor = $next->find('a', 0);
        return $anchor ? $anchor->href : null;
    }

    /**
     * Loads a site-relative path from faufcc.ru into a fresh DOM.
     */
    private function loadSitePath($path) {
        $dom = new Dom;
        $dom->loadFromUrl("https://faufcc.ru{$path}");
        return $dom;
    }
}

View File

@ -13,14 +13,6 @@ class RegistryImportService {
protected string $url;
protected Dom $dom;
protected array $mimes = [
'docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'pdf' => 'application/pdf',
'xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'jpg' => 'image/jpg'
];
public function __construct(Registry $registry, string $url) {
$this->registry = $registry;
$this->url = $url;

View File

@ -73,5 +73,10 @@ Artisan::command('htmlparser:import-news', function() {
}
});
// Imports the anti-corruption registry scraped from faufcc.ru into the
// registry of the local page found by its URL.
Artisan::command('htmlparser:import-anticor', function() {
    $url = 'https://www.faufcc.ru/about-us/protivodeystvie-korruptsii/';
    $pageUrl = '/o-tsentre/protivodeistvie-korruptsii';
    $page = \App\Models\Pages\Page::byUrl($pageUrl);
    $registry = $page ? $page->registry : null;
    // Fail with a readable message instead of a null dereference or a
    // TypeError in the service constructor when the target is missing.
    if (!$registry) {
        $this->error("No registry found for page {$pageUrl}; nothing was imported.");
        return 1;
    }
    $service = new \App\Services\Registries\AnticorImportService($registry, $url);
    $service->import();
});