From 0a057d67c5adadfa9623c2169092b7a8c2b84767 Mon Sep 17 00:00:00 2001 From: panabonic Date: Sat, 2 Sep 2023 21:17:35 +0300 Subject: [PATCH] anticor registry import --- app/Models/Pages/Page.php | 2 +- app/Services/FileDownloadService.php | 5 +- .../Registries/AnticorImportService.php | 85 +++++++++++++++++++ .../Registries/RegistryImportService.php | 8 -- routes/console.php | 7 +- 5 files changed, 96 insertions(+), 11 deletions(-) create mode 100644 app/Services/Registries/AnticorImportService.php diff --git a/app/Models/Pages/Page.php b/app/Models/Pages/Page.php index f698192..aaf6108 100644 --- a/app/Models/Pages/Page.php +++ b/app/Models/Pages/Page.php @@ -118,7 +118,7 @@ class Page extends Model { public static function byUrl($url) { if ($url = trim($url, '/ ')) { $query = self::query(); - collect(explode('/', $url))->reverse()->values()->each(function($slug, $index) use ($query) { + collect(explode('/', $url))->reverse()->values()->each(function($slug, $index) use($query) { if ($slug !== '') { $index ? 
$query->nthParentSlug($index, $slug) : $query->bySlug($slug); } diff --git a/app/Services/FileDownloadService.php b/app/Services/FileDownloadService.php index 336f5e2..4bd187d 100644 --- a/app/Services/FileDownloadService.php +++ b/app/Services/FileDownloadService.php @@ -10,8 +10,11 @@ use Illuminate\Support\Facades\Storage; class FileDownloadService { protected array $documentMimes = [ 'docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'doc' => 'application/msword', 'pdf' => 'application/pdf', - 'xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' + 'xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + 'xls' => 'application/vnd.ms-excel', + 'pptx' => 'application/vnd.openxmlformats-officedocument.presentationml.presentation' ]; protected array $imageMimes = [ 'jpg' => 'image/jpg', diff --git a/app/Services/Registries/AnticorImportService.php b/app/Services/Registries/AnticorImportService.php new file mode 100644 index 0000000..e150663 --- /dev/null +++ b/app/Services/Registries/AnticorImportService.php @@ -0,0 +1,85 @@ +loadFromUrl($url); + /* NOTE(review): this copy of the patch jumps from the hunk header straight to loadFromUrl($url); the opening of the new file and of this method is not visible here. */ $pages = $dom->find('.paging__item')->toArray(); + $key = null; + foreach ($pages as $k => $page) { + if ($page->class === 'paging__item active') $key = $k; + } + if (isset($key)) $next = $pages[$key + 1] ?? null; + /* NOTE(review): $next is assigned only when an active paging item was found on the previous line, so it may be undefined here. */ $link = $next ? $next->find('a', 0) : null; + /* NOTE(review): prints the next-page href or null; looks like leftover debug output, confirm before merging. */ var_dump($link->href ?? 
null); + } + + /** Entry point: for every link matched by .user-container ul li a in $this->dom, adds a registry category named after the trimmed link text, then parses the link target first as a sub-category page and then as an entries page. */ public function import() { + $nodes = $this->dom->find('.user-container ul li a')->toArray(); + foreach ($nodes as $k => $node) { + //if ($k) return; + $category = $this->registry->addCategory(trim($node->text)); + $this->parseSubpage($node->href, $category); + $this->parseEntriesPage($node->href, $category); + } + } + + /** Loads the faufcc.ru page at $url and, for each .user-container ul li a link on it, adds a child category under $parent and parses the link target as an entries page. */ public function parseSubpage($url, Category $parent) { + $dom = new Dom; + $dom->loadFromUrl("https://faufcc.ru{$url}"); + $nodes = $dom->find('.user-container ul li a')->toArray(); + foreach ($nodes as $k => $node) { + $category = $parent->addCategory(trim($node->text)); + $this->parseEntriesPage($node->href, $category); + } + } + + + /** Loads the faufcc.ru page at $url and walks every node found under .user-container: an h3 opens a new child category of $parent, and the tr a links of each table are imported into the most recently created h3 category (or into $parent when no h3 has been seen). Afterwards recurses into the href returned by getNextPage(), if any. */ public function parseEntriesPage($url, Category $parent) { + $dom = new Dom; + $dom->loadFromUrl("https://faufcc.ru{$url}"); + $nodes = $dom->find('.user-container *')->toArray(); + foreach ($nodes as $k => $node) { + if ($node->tag->name() === 'h3') { + $category = $parent->addCategory(trim($node->text)); + } elseif ($node->tag->name() === 'table') { + $this->importEntries($category ?? $parent, $node->find('tr a')->toArray()); + } + } + if ($res = $this->getNextPage($dom)) $this->parseEntriesPage($res, $parent); + } + + + /** Creates or updates one registry entry per link in $items. The name is the trimmed link text passed through Str::limit with 745, with ( and ) swapped for « and ». The entry is looked up or created by name and category id (0 when $category has no id), then the href is handed to $this->download() together with the string registries/anticor; the resulting asset id is stored, and when no asset comes back the raw href is saved as link instead. download() is not defined in the visible part of this patch. */ public function importEntries(Category $category, $items) { + foreach ($items as $item) { + /* NOTE(review): as shown here the search and replacement passed to the inner Str::replace are the same character, so that call changes nothing; possibly an HTML entity was lost, confirm. */ $name = Str::limit(Str::replace('"', '"', trim($item->text)), 745); + $name = Str::replace('(', '«', $name); + $name = Str::replace(')', '»', $name); + $entry = $this->registry->entries()->firstOrCreate(['name' => $name, 'category_id' => $category->id ?? 0]); + $asset = $this->download($item->href, 'registries/anticor'); + $entry->update(['asset_id' => $asset->id ?? null]); + if (!$asset) $entry->update(['link' => $item->href]); + } + } + + + /** Finds the .paging__item element whose class is exactly paging__item active and returns the href of the first a element inside the item that follows it; returns null when there is no active item or nothing after it. */ public function getNextPage($dom) { + $pages = $dom->find('.paging__item')->toArray(); + $key = null; + foreach ($pages as $k => $page) { + if ($page->class === 'paging__item active') $key = $k; + } + $next = isset($key) ? ($pages[$key + 1] ?? 
null) : null; + /* NOTE(review): the result of the find call for the first a element is dereferenced unchecked; presumably this breaks when the following item contains no link, confirm. */ return $next ? $next->find('a', 0)->href : null; + } + + +} \ No newline at end of file diff --git a/app/Services/Registries/RegistryImportService.php b/app/Services/Registries/RegistryImportService.php index f45d6bc..fa61988 100644 --- a/app/Services/Registries/RegistryImportService.php +++ b/app/Services/Registries/RegistryImportService.php @@ -13,14 +13,6 @@ class RegistryImportService { protected string $url; protected Dom $dom; - protected array $mimes = [ - 'docx' => 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', - 'pdf' => 'application/pdf', - 'xlsx' => 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', - 'jpg' => 'image/jpg' - ]; - - public function __construct(Registry $registry, string $url) { $this->registry = $registry; $this->url = $url; diff --git a/routes/console.php b/routes/console.php index 2e238d1..b78034c 100644 --- a/routes/console.php +++ b/routes/console.php @@ -73,5 +73,10 @@ Artisan::command('htmlparser:import-news', function() { } }); - +Artisan::command('htmlparser:import-anticor', function() { + /* Command body: takes the registry of the page found by Page::byUrl for /o-tsentre/protivodeistvie-korruptsii and runs AnticorImportService::import() for the source URL below. NOTE(review): the byUrl result is dereferenced directly; presumably this fails when no page matches, confirm. */ $url = 'https://www.faufcc.ru/about-us/protivodeystvie-korruptsii/'; + $registry = \App\Models\Pages\Page::byUrl('/o-tsentre/protivodeistvie-korruptsii')->registry; + $service = new \App\Services\Registries\AnticorImportService($registry, $url); + $service->import(); +});