anticor registry import
parent
abd41c2477
commit
0a057d67c5
|
|
@ -118,7 +118,7 @@ class Page extends Model {
|
|||
public static function byUrl($url) {
|
||||
if ($url = trim($url, '/ ')) {
|
||||
$query = self::query();
|
||||
collect(explode('/', $url))->reverse()->values()->each(function($slug, $index) use ($query) {
|
||||
collect(explode('/', $url))->reverse()->values()->each(function($slug, $index) use($query) {
|
||||
if ($slug !== '') {
|
||||
$index ? $query->nthParentSlug($index, $slug) : $query->bySlug($slug);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,8 +10,11 @@ use Illuminate\Support\Facades\Storage;
|
|||
class FileDownloadService {
|
||||
protected array $documentMimes = [
|
||||
'docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||
'doc' => 'application/msword',
|
||||
'pdf' => 'application/pdf',
|
||||
'xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
|
||||
'xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||
'xls' => 'application/vnd.ms-excel',
|
||||
'pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
|
||||
];
|
||||
protected array $imageMimes = [
|
||||
'jpg' => 'image/jpg',
|
||||
|
|
|
|||
|
|
@ -0,0 +1,85 @@
|
|||
<?php
|
||||
|
||||
namespace App\Services\Registries;
|
||||
|
||||
use App\Models\Registries\Category;
|
||||
use Illuminate\Support\Str;
|
||||
use PHPHtmlParser\Dom;
|
||||
|
||||
/**
 * Imports the anti-corruption document registry by scraping the source site.
 *
 * Starting from the DOM held by the parent service, every link found under
 * `.user-container ul li a` becomes a registry category; the linked page is
 * then scanned for sub-category links and for tables of document links,
 * which are stored as registry entries.
 */
class AnticorImportService extends RegistryImportService {

    /** Origin that site-relative hrefs scraped from the source pages are resolved against. */
    private const SOURCE_ORIGIN = 'https://faufcc.ru';

    /**
     * Debug helper: loads one hard-coded paginated listing page and dumps
     * the href of the page that follows the active one (or NULL).
     */
    public function test() {
        $url = "https://www.faufcc.ru/about-us/protivodeystvie-korruptsii/normativnye-pravovye-akty/vedomstvennye-normativnye/index.php?PAGEN_1=3";
        $dom = new Dom;
        $dom->loadFromUrl($url);
        // Reuse the production pagination lookup so the debug output reflects
        // exactly what the importer would follow.
        var_dump($this->getNextPage($dom));
    }

    /**
     * Entry point: creates one top-level category per link on the start page
     * and imports both its sub-pages and its own entries.
     */
    public function import() {
        $nodes = $this->dom->find('.user-container ul li a')->toArray();
        foreach ($nodes as $node) {
            $category = $this->registry->addCategory(trim($node->text));
            $this->parseSubpage($node->href, $category);
            $this->parseEntriesPage($node->href, $category);
        }
    }

    /**
     * Loads a category page and imports every linked sub-page as a child
     * category of $parent.
     *
     * @param string   $url    href of the category page as found in the markup
     * @param Category $parent category that the discovered sub-categories are added to
     */
    public function parseSubpage($url, Category $parent) {
        $dom = new Dom;
        $dom->loadFromUrl($this->resolveSourceUrl($url));
        $nodes = $dom->find('.user-container ul li a')->toArray();
        foreach ($nodes as $node) {
            $category = $parent->addCategory(trim($node->text));
            $this->parseEntriesPage($node->href, $category);
        }
    }

    /**
     * Imports the entries of a listing page and of every page that follows it
     * in the pager.
     *
     * Within one page, each `h3` opens a new child category of $parent and
     * each `table` contributes its `tr a` links as entries of the most recent
     * such category (or of $parent when no `h3` has been seen on that page).
     *
     * @param string   $url    href of the first listing page
     * @param Category $parent category that entries and `h3` categories are attached to
     */
    public function parseEntriesPage($url, Category $parent) {
        // Pages already processed; stops the walk if the pager ever links back
        // to a page that was handled before.
        $visited = [];
        do {
            $visited[(string) $url] = true;

            $dom = new Dom;
            $dom->loadFromUrl($this->resolveSourceUrl($url));

            // The heading-derived category does not carry over between pages.
            $category = null;
            foreach ($dom->find('.user-container *')->toArray() as $node) {
                $tagName = $node->tag->name();
                if ($tagName === 'h3') {
                    $category = $parent->addCategory(trim($node->text));
                } elseif ($tagName === 'table') {
                    $this->importEntries($category ?? $parent, $node->find('tr a')->toArray());
                }
            }

            $url = $this->getNextPage($dom);
        } while ($url && !isset($visited[$url]));
    }

    /**
     * Creates (or reuses) one registry entry per link and attaches the
     * downloaded file to it; when the download yields nothing, the original
     * href is stored as a plain link instead.
     *
     * @param Category $category category the entries belong to
     * @param iterable $items    anchor nodes exposing `text` and `href`
     */
    public function importEntries(Category $category, $items) {
        foreach ($items as $item) {
            // NOTE(review): as written the first replacement maps '"' to itself (a no-op);
            // presumably it was meant to decode an HTML entity — confirm against the repository.
            $name = Str::limit(Str::replace('"', '"', trim($item->text)), 745);
            $name = Str::replace('(', '«', $name);
            $name = Str::replace(')', '»', $name);

            $entry = $this->registry->entries()->firstOrCreate(['name' => $name, 'category_id' => $category->id ?? 0]);

            $asset = $this->download($item->href, 'registries/anticor');

            // Single write: always record the asset id, and fall back to the
            // raw link only when no asset was obtained.
            $attributes = ['asset_id' => $asset->id ?? null];
            if (!$asset) {
                $attributes['link'] = $item->href;
            }
            $entry->update($attributes);
        }
    }

    /**
     * Finds the href of the pager item that follows the active one.
     *
     * @param Dom $dom parsed listing page
     * @return string|null href of the next page, or NULL when there is no
     *                     active item, no following item, or the following
     *                     item contains no anchor
     */
    public function getNextPage($dom) {
        $pages = $dom->find('.paging__item')->toArray();

        $activeKey = null;
        foreach ($pages as $k => $page) {
            // Exact class-attribute match, as the source markup is expected to emit it.
            if ($page->class === 'paging__item active') $activeKey = $k;
        }
        if ($activeKey === null) {
            return null;
        }

        $next = $pages[$activeKey + 1] ?? null;
        if (!$next) {
            return null;
        }

        // The following pager item may lack an anchor; do not dereference null.
        $anchor = $next->find('a', 0);
        return $anchor ? $anchor->href : null;
    }

    /**
     * Turns an href scraped from the source site into a loadable URL:
     * absolute http(s) URLs are returned untouched, anything else is
     * appended to the source origin.
     */
    private function resolveSourceUrl($href) {
        $href = (string) $href;
        return preg_match('#^https?://#i', $href) ? $href : self::SOURCE_ORIGIN . $href;
    }

}
|
||||
|
|
@ -13,14 +13,6 @@ class RegistryImportService {
|
|||
protected string $url;
|
||||
protected Dom $dom;
|
||||
|
||||
protected array $mimes = [
|
||||
'docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||
'pdf' => 'application/pdf',
|
||||
'xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||
'jpg' => 'image/jpg'
|
||||
];
|
||||
|
||||
|
||||
public function __construct(Registry $registry, string $url) {
|
||||
$this->registry = $registry;
|
||||
$this->url = $url;
|
||||
|
|
|
|||
|
|
@ -73,5 +73,10 @@ Artisan::command('htmlparser:import-news', function() {
|
|||
}
|
||||
});
|
||||
|
||||
|
||||
// Imports the anti-corruption registry from the source site into the registry
// attached to the local "/o-tsentre/protivodeistvie-korruptsii" page.
Artisan::command('htmlparser:import-anticor', function() {
    $url = 'https://www.faufcc.ru/about-us/protivodeystvie-korruptsii/';

    // The target page or its registry may be missing (e.g. on a fresh
    // database); fail with a readable message instead of a fatal error.
    $page = \App\Models\Pages\Page::byUrl('/o-tsentre/protivodeistvie-korruptsii');
    $registry = $page->registry ?? null;
    if (!$registry) {
        $this->error('Page /o-tsentre/protivodeistvie-korruptsii was not found or has no registry attached; nothing was imported.');
        return 1;
    }

    $service = new \App\Services\Registries\AnticorImportService($registry, $url);
    $service->import();
});
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue