Automation/app/Libraries/MyCrawler/MyCrawler.php
2024-09-19 19:37:28 +09:00

136 lines
5.9 KiB
PHP

<?php
namespace App\Libraries\MyCrawler;
use App\Libraries\CommonLibrary;
use Symfony\Component\DomCrawler\Crawler;
use App\Traits\FileTrait;
/**
 * Base class for crawlers that extract media (image/video) URLs from HTML,
 * download them through a socket object and persist them through a storage object.
 *
 * Concrete subclasses supply the socket (getMySocket) and the storage factory
 * (createMyStorage). The debug flag `$this->isDebug` is not declared in this
 * class; presumably it is inherited from CommonLibrary (confirm there).
 */
abstract class MyCrawler extends CommonLibrary
{
    use FileTrait;

    /**
     * Only forwards to the parent constructor. Protected, so instances cannot
     * be created with `new` from outside the class hierarchy.
     */
    protected function __construct()
    {
        parent::__construct();
    }

    /**
     * Returns the object used to reach the remote site.
     * This class calls getHost() and getContent(string $url) on it.
     */
    abstract protected function getMySocket();

    /**
     * Returns a storage object for one downloaded file.
     * This class calls setOriginName/Content/Type/Sequence() and save() on it.
     */
    abstract protected function createMyStorage();

    /**
     * Parses $content and returns the sub-crawler matching the CSS selector $tag.
     *
     * @param string $content HTML to parse
     * @param string $tag     CSS selector handed to Crawler::filter()
     *
     * @return Crawler the matched nodes (may be empty)
     */
    final protected function getSelector(string $content, string $tag): Crawler
    {
        $crawler = new Crawler($content);
        if ($this->isDebug) {
            log_message("debug", __FUNCTION__ . "=> " . $tag);
        }
        // Filter exactly once and reuse the result for both logging and the return value.
        $selected = $crawler->filter($tag);
        if ($this->isDebug) {
            // Crawler::html() throws on an empty node list; enabling debug must not
            // change control flow, so dump an empty string in that case.
            log_message("debug", sprintf(
                "\n------------%s HTML-------------\n%s\n-----------------------------------------------------\n",
                __FUNCTION__,
                $selected->count() > 0 ? $selected->html() : ''
            ));
        }
        return $selected;
    }

    /**
     * Returns $url with its query string removed (everything before the first '?').
     *
     * @return string the stripped URL, or '' when $url is empty or starts with '?'
     */
    protected function changeURLByCrawler(string $url): string
    {
        // The declared return type is string, so never hand back null on a failed match.
        return preg_match('/^[^?]+/', $url, $matches) === 1 ? $matches[0] : '';
    }

    /**
     * Reads the media URL from a single node.
     *
     * For 'video' the attribute is looked up on the element itself first
     * (<video src="test.mp4"></video> or <video data-src="test.mp4"></video>)
     * and, when absent, on a nested source element
     * (<video><source src="test.mp4"></source></video>).
     *
     * @param Crawler $node       crawler positioned on the media element
     * @param string  $media_type 'video', 'img' or any other tag name
     * @param string  $attr       attribute expected to carry the URL
     *
     * @return null|string the attribute value, or null when no URL was found
     */
    protected function getUrlByMediaType(Crawler $node, string $media_type, string $attr): null|string
    {
        // attr() returns null for a missing attribute (it only throws on an empty
        // node list), so the fallback has to be driven by the value, not by a catch.
        $url = $node->attr($attr);
        if ($media_type === 'video' && ($url === null || $url === '')) {
            $sources = $node->filter('source[src]');
            if ($sources->count() > 0) {
                $url = $sources->attr("src");
            }
        }
        return $url;
    }

    /**
     * Collects the URLs of every $media_type element below $selector.
     *
     * @param Crawler $selector   crawler to search in
     * @param string  $media_type tag name to look for; also the key used in the result
     * @param string  $attr       attribute expected to carry the URL
     * @param array   $urls       previously collected URLs; the entry for $media_type is reset
     *
     * @return array $urls with $urls[$media_type] set to the list of query-less URLs
     */
    protected function getUrlsByMediaType(Crawler $selector, string $media_type, string $attr, array $urls = []): array
    {
        // __FUNCTION__ evaluates to "{closure}" inside the callback, so resolve it here.
        $caller = __FUNCTION__;
        log_message("notice", "-----------" . $caller . "=> {$media_type} 작업시작--------");
        $urls[$media_type] = [];
        $selector->filter($media_type)->each(
            // Only $urls is mutated, so it is the only by-reference capture. Capturing
            // $attr by reference let the attribute dump below overwrite it for later nodes.
            function (Crawler $node) use ($caller, $media_type, $attr, &$urls): void {
                $url = $this->getUrlByMediaType($node, $media_type, $attr);
                if ($url !== null && preg_match('/^[^?]+/', $url, $matches)) {
                    $urls[$media_type][] = $this->changeURLByCrawler($matches[0]);
                    return;
                }
                log_message("debug", $caller . "-> {$media_type}:{$attr}\n");
                // All attributes of the node can be listed once it is taken as a DOM node.
                $domNode = $node->getNode(0);
                if ($domNode !== null && $domNode->hasAttributes()) {
                    foreach ($domNode->attributes as $domAttr) {
                        log_message("debug", "{$domAttr->nodeName} = {$domAttr->nodeValue}");
                    }
                }
            }
        );
        log_message("notice", "-----------" . $caller . "=> {$media_type} 작업완료--------");
        return $urls;
    }

    /**
     * Stores one downloaded file through a fresh storage object.
     *
     * @return mixed whatever the storage's save() returns
     */
    private function media_save(int $file_sequence, string $media_type, string $file_name, string $content): mixed
    {
        log_message("debug", __FUNCTION__ . " 원본파일 {$file_name} 작업 시작");
        $storage = $this->createMyStorage();
        $storage->setOriginName($file_name);
        $storage->setOriginContent($content);
        $storage->setOriginType($media_type);
        $storage->setOriginSequence($file_sequence);
        return $storage->save();
    }

    /**
     * Downloads the image or video found on a view page.
     *
     * @param string $media_type type the file extension is validated against
     * @param string $url        URL whose last path segment is the file name
     *
     * @return array [file name, downloaded content]
     *
     * @throws \Exception when the URL has no file name or the extension does not fit $media_type
     */
    private function media_download(string $media_type, string $url): array
    {
        // explode() never returns an empty array, so validate the last segment itself.
        $segments = explode('/', $url);
        $file_name = (string) array_pop($segments);
        if ($file_name === '') {
            throw new \Exception("URL이 파일명 형식이 아닙니다 : " . $this->getMySocket()->getHost() . $url);
        }
        $dot = strrpos($file_name, '.');
        $file_ext = $dot === false ? '' : substr($file_name, $dot + 1);
        if ($file_ext === '' || !$this->isFileType_FileTrait($file_ext, $media_type)) {
            throw new \Exception("파일명 형식이 {$media_type}가 아닙니다");
        }
        $content = $this->getMySocket()->getContent($url);
        log_message("notice", "{$file_name} 파일이 다운로드되었습니다!");
        return [$file_name, $content];
    }

    /**
     * Downloads and stores every URL in $media_urls. A failure on one URL is
     * logged as a warning and does not stop the remaining ones.
     *
     * @param array $media_urls map of media type => list of URLs
     *
     * @return array the results of the successful saves, in processing order
     */
    final protected function media_process(array $media_urls): array
    {
        $file_sequence = 1; // runs across all media types, not per type
        $storages = []; // reset for every detail page so CreateBoard can use it
        $last_url = ''; // keeps the closing log line valid when there was nothing to process
        foreach ($media_urls as $media_type => $urls) {
            $total = count($urls);
            foreach ($urls as $url) {
                $last_url = $url;
                log_message("notice", __FUNCTION__ . " {$file_sequence}번째/총:{$total} MediaType->{$media_type} 작업 시작");
                try {
                    [$file_name, $content] = $this->media_download($media_type, $url);
                    $storage = $this->media_save($file_sequence, $media_type, $file_name, $content);
                    log_message("debug", __FUNCTION__ . " {$file_sequence}번째/총:{$total} 결과=>" . $storage->getOriginName());
                    $storages[] = $storage;
                } catch (\Exception $e) {
                    log_message("warning", sprintf(
                        "\n---%s MediaType->%s {$file_sequence}번째/총:{$total} 오류---\n%s\n-----------------------------------------\n",
                        __FUNCTION__,
                        $media_type,
                        $e->getMessage()
                    ));
                }
                log_message("notice", __FUNCTION__ . " {$file_sequence}번째/총:{$total} MediaType->{$media_type} 작업 완료");
                $file_sequence++;
            }
        }
        log_message("notice", __FUNCTION__ . "=> 게시물 {$last_url} 작업종료");
        return $storages;
    }
}