anticor registry import
parent
abd41c2477
commit
0a057d67c5
|
|
@ -10,8 +10,11 @@ use Illuminate\Support\Facades\Storage;
|
||||||
class FileDownloadService {
|
class FileDownloadService {
|
||||||
protected array $documentMimes = [
|
protected array $documentMimes = [
|
||||||
'docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
'docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||||
|
'doc' => 'application/msword',
|
||||||
'pdf' => 'application/pdf',
|
'pdf' => 'application/pdf',
|
||||||
'xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
|
'xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||||
|
'xls' => 'application/vnd.ms-excel',
|
||||||
|
'pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
|
||||||
];
|
];
|
||||||
protected array $imageMimes = [
|
protected array $imageMimes = [
|
||||||
'jpg' => 'image/jpg',
|
'jpg' => 'image/jpg',
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,85 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
namespace App\Services\Registries;
|
||||||
|
|
||||||
|
use App\Models\Registries\Category;
|
||||||
|
use Illuminate\Support\Str;
|
||||||
|
use PHPHtmlParser\Dom;
|
||||||
|
|
||||||
|
/**
 * Imports the anti-corruption ("anticor") document registry by scraping faufcc.ru.
 *
 * Relies on members provided outside this class: `$this->dom` and `$this->registry`
 * (set up by RegistryImportService) and a `download()` helper that is not defined here.
 */
class AnticorImportService extends RegistryImportService {

    /** Host that the links scraped from the site are resolved against. */
    private const ANTICOR_BASE_URL = 'https://faufcc.ru';

    /**
     * Debug helper: loads one fixed, paginated listing page and dumps the href of
     * the page that follows the active one (or NULL when there is none).
     */
    public function test() {
        $url = "https://www.faufcc.ru/about-us/protivodeystvie-korruptsii/normativnye-pravovye-akty/vedomstvennye-normativnye/index.php?PAGEN_1=3";
        $dom = new Dom;
        $dom->loadFromUrl($url);

        // Delegates to getNextPage() so the lookup lives in one place and never
        // touches an unassigned variable when no paging item is marked active.
        var_dump($this->getNextPage($dom));
    }

    /**
     * Entry point: creates one registry category per link found on the already
     * loaded start page, then imports that link's sub-categories and entries.
     */
    public function import() {
        $links = $this->dom->find('.user-container ul li a')->toArray();

        foreach ($links as $link) {
            $category = $this->registry->addCategory(trim($link->text));

            // Both calls load the same URL independently: the first looks for
            // nested category links, the second for headings and document tables.
            $this->parseSubpage($link->href, $category);
            $this->parseEntriesPage($link->href, $category);
        }
    }

    /**
     * Creates a child category of $parent for every link on the given page and
     * imports the entries each link points to.
     *
     * @param string|null $url    href taken from the parent page
     * @param Category    $parent category the new sub-categories are attached to
     */
    public function parseSubpage($url, Category $parent) {
        $dom = $this->loadAnticorPage($url);
        $links = $dom->find('.user-container ul li a')->toArray();

        foreach ($links as $link) {
            $category = $parent->addCategory(trim($link->text));
            $this->parseEntriesPage($link->href, $category);
        }
    }

    /**
     * Imports the document tables of a listing page and of every page that
     * follows it in the pager.
     *
     * Each <h3> opens a new sub-category under $parent; links inside a <table>
     * are imported into the most recent <h3> category seen on the same page, or
     * into $parent when no heading precedes the table.
     *
     * @param string|null $url    href of the first page to read
     * @param Category    $parent category that owns the imported entries
     */
    public function parseEntriesPage($url, Category $parent) {
        do {
            $dom = $this->loadAnticorPage($url);

            // The heading context deliberately starts empty on every page.
            $category = null;

            foreach ($dom->find('.user-container *')->toArray() as $node) {
                $tagName = $node->tag->name();

                if ($tagName === 'h3') {
                    $category = $parent->addCategory(trim($node->text));
                } elseif ($tagName === 'table') {
                    $this->importEntries($category ?? $parent, $node->find('tr a')->toArray());
                }
            }

            $url = $this->getNextPage($dom);
        } while ($url);
    }

    /**
     * Creates (or reuses) one registry entry per link and attaches either the
     * downloaded asset or, when the download yields nothing, the original link.
     *
     * @param Category $category category the entries belong to
     * @param iterable $items    anchor nodes exposing `text` and `href`
     */
    public function importEntries(Category $category, $items) {
        foreach ($items as $item) {
            // NOTE(review): SOURCE replaced '"' with '"' here, which is a no-op;
            // presumably the search string was the `&quot;` entity lost in
            // rendering. TODO confirm against the repository.
            $name = Str::limit(Str::replace('&quot;', '"', trim($item->text)), 745);
            $name = Str::replace(['(', ')'], ['«', '»'], $name);

            $entry = $this->registry->entries()->firstOrCreate([
                'name' => $name,
                'category_id' => $category->id ?? 0,
            ]);

            $asset = $this->download($item->href, 'registries/anticor');

            // Single write instead of two consecutive update() calls.
            $attributes = ['asset_id' => $asset->id ?? null];
            if (!$asset) $attributes['link'] = $item->href;
            $entry->update($attributes);
        }
    }

    /**
     * Returns the href of the pager item that follows the active one.
     *
     * @param Dom $dom parsed listing page
     * @return string|null href of the next page; null when no item is active,
     *                     the active item is the last one, or the next item
     *                     contains no link
     */
    public function getNextPage($dom) {
        $pages = $dom->find('.paging__item')->toArray();

        $activeIndex = null;
        foreach ($pages as $index => $page) {
            // Exact match on the full class attribute; the last match wins.
            if ($page->class === 'paging__item active') $activeIndex = $index;
        }
        if ($activeIndex === null) return null;

        $next = $pages[$activeIndex + 1] ?? null;
        if (!$next) return null;

        // The following item may have no anchor; do not dereference null.
        $anchor = $next->find('a', 0);

        return $anchor ? $anchor->href : null;
    }

    /**
     * Loads a page of the source site into a fresh DOM.
     *
     * @param string|null $href scraped href
     */
    private function loadAnticorPage($href): Dom {
        $dom = new Dom;
        $dom->loadFromUrl($this->resolveAnticorUrl($href));

        return $dom;
    }

    /**
     * Resolves a scraped href against the source host; hrefs that already carry
     * an http(s) scheme are returned untouched instead of being prefixed.
     *
     * @param string|null $href scraped href
     */
    private function resolveAnticorUrl($href): string {
        $href = (string) $href;

        return preg_match('#^https?://#i', $href) === 1
            ? $href
            : self::ANTICOR_BASE_URL . $href;
    }

}
|
||||||
|
|
@ -13,14 +13,6 @@ class RegistryImportService {
|
||||||
protected string $url;
|
protected string $url;
|
||||||
protected Dom $dom;
|
protected Dom $dom;
|
||||||
|
|
||||||
protected array $mimes = [
|
|
||||||
'docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
|
||||||
'pdf' => 'application/pdf',
|
|
||||||
'xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
|
||||||
'jpg' => 'image/jpg'
|
|
||||||
];
|
|
||||||
|
|
||||||
|
|
||||||
public function __construct(Registry $registry, string $url) {
|
public function __construct(Registry $registry, string $url) {
|
||||||
$this->registry = $registry;
|
$this->registry = $registry;
|
||||||
$this->url = $url;
|
$this->url = $url;
|
||||||
|
|
|
||||||
|
|
@ -73,5 +73,10 @@ Artisan::command('htmlparser:import-news', function() {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Console command: runs the anticor import from faufcc.ru into the registry of
// the local page found under /o-tsentre/protivodeistvie-korruptsii.
Artisan::command('htmlparser:import-anticor', function() {
    $page = \App\Models\Pages\Page::byUrl('/o-tsentre/protivodeistvie-korruptsii');

    $importer = new \App\Services\Registries\AnticorImportService(
        $page->registry,
        'https://www.faufcc.ru/about-us/protivodeystvie-korruptsii/'
    );

    $importer->import();
});
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue