Automation/app/Libraries/MyCrawler/MyCrawler.php
2024-09-16 14:57:18 +09:00

162 lines
7.1 KiB
PHP

<?php
namespace App\Libraries\MyCrawler;
use App\Libraries\CommonLibrary;
use Symfony\Component\DomCrawler\Crawler;
use App\Traits\FileTrait;
/**
 * Base class for crawlers that walk a list page, open each detail page,
 * download the media (images / videos) referenced there and hand the stored
 * results to a backend step.
 *
 * Concrete crawlers supply the storage object, the list/detail page parsing
 * and the backend processing; this class provides the shared media handling
 * and the main processing loop.
 */
abstract class MyCrawler extends CommonLibrary
{
    use FileTrait;

    /** Socket object injected through the constructor; used for getHost() and getContent(). */
    private $_mySocket = null;

    /** Results of getMyStorage()->save() collected for the detail page currently being processed. */
    protected $_storages = [];

    /**
     * @param mixed $mySocket Socket object used to fetch remote content.
     */
    protected function __construct($mySocket)
    {
        parent::__construct();
        $this->_mySocket = $mySocket;
    }

    /** Returns the storage object that media_save() fills in and saves. */
    abstract protected function getMyStorage();

    /** Produces the list-page entries. */
    abstract protected function list_page(): array;

    /**
     * Processes one list entry.
     *
     * main_process() destructures the result as [$listInfo, $media_urls].
     */
    abstract protected function detail_page(array $listInfo): array;

    /** Backend step called by main_process() after the media of one entry has been stored. */
    abstract protected function backend_process(int $i, array $listInfo, array $storages);

    /**
     * Returns the injected socket.
     *
     * @throws \Exception when no socket has been set.
     */
    final protected function getMySocket()
    {
        if ($this->_mySocket === null) {
            throw new \Exception("Socket이 지정되지 않았습니다.");
        }

        return $this->_mySocket;
    }

    /**
     * Parses $content and returns the nodes matching the selector $tag.
     *
     * NOTE(review): when getDebug() is truthy the content is logged and the
     * whole process is terminated with exit. This looks like a leftover
     * debugging aid; it is kept as-is, please confirm it is intended.
     */
    final protected function getSelector(string $content, string $tag): Crawler
    {
        $crawler = new Crawler($content);
        if ($this->getDebug()) {
            log_message("debug", sprintf(
                "\n---------%s----------\ntag:%s\n%s\n-------------------\n",
                __FUNCTION__,
                $tag,
                $content
            ));
            exit;
        }

        return $crawler->filter($tag);
    }

    // -------- Media URL helpers --------

    /**
     * Collects the $options["attr"] attribute of every $options["tag"] node
     * below $selector and stores the non-null values under $urls[$media_type].
     *
     * Any existing $urls[$media_type] entry is reset to an empty list first.
     *
     * @param array $options Must contain the keys "tag" and "attr".
     * @param array $urls    Previously collected URLs to extend.
     */
    private function getMediaUrlsByMediaType(string $media_type, Crawler $selector, array $options, array $urls = []): array
    {
        // Resolve the method name here: inside the closure __FUNCTION__
        // evaluates to the closure's own name, not to this method.
        $method = __FUNCTION__;
        $attr = $options["attr"];
        $urls[$media_type] = [];
        $selector->filter($options["tag"])->each(
            function (Crawler $node) use ($method, $media_type, $attr, &$urls): void {
                $url = $node->attr($attr);
                log_message("debug", $method . "-> {$media_type}[{$attr}]:{$url}");
                if ($url !== null) {
                    $urls[$media_type][] = $url;
                }
            }
        );

        return $urls;
    }

    /**
     * Collects the image and video URLs found below $tag in a detail page.
     *
     * @return array [$listInfo, $urls] where $urls has the keys "image" and
     *               "video"; $listInfo is returned unchanged.
     */
    final protected function getMediaUrls(string $response, string $tag, array $listInfo): array
    {
        $selector = $this->getSelector($response, $tag);
        // NOTE(review): per the Symfony DomCrawler documentation html() throws
        // when the node list is empty, so a $tag that matches nothing would
        // abort inside this debug log line; confirm that is the intended way
        // to reject such pages.
        log_message("debug", "\n-----------detailPage Tag: {$tag}---------------\n{$selector->html()}\n---------------------------\n");
        $urls = $this->getMediaUrlsByMediaType("image", $selector, ["tag" => "img", "attr" => "src"]);
        $urls = $this->getMediaUrlsByMediaType("video", $selector, ["tag" => "video", "attr" => "src"], $urls);
        log_message("debug", "\n-------------------------\n" . var_export($urls, true) . "\n-----------------------\n");
        log_message("notice", "-----------" . __FUNCTION__ . " 작업완료--------");

        return [$listInfo, $urls];
    }

    // -------- Media download / storage --------

    /**
     * Passes one downloaded file to the storage object and records the
     * result of save() in $this->_storages.
     */
    private function media_save(int $file_sequence, string $media_type, string $file_name, string $content): void
    {
        log_message("debug", __FUNCTION__ . " 원본파일 {$file_name} 작업 시작");
        $this->getMyStorage()->setOriginName($file_name);
        $this->getMyStorage()->setOriginContent($content);
        $this->getMyStorage()->setOriginType($media_type);
        $this->getMyStorage()->setOriginSequence($file_sequence);
        $this->_storages[] = $this->getMyStorage()->save();
    }

    /**
     * Downloads one media file referenced by $url.
     *
     * The file name is the last path segment of the URL with any query
     * string or fragment removed, so "a/b.jpg?v=1" yields "b.jpg". The
     * download itself still uses the full, unmodified $url.
     *
     * @return array [$file_name, $content]
     *
     * @throws \Exception when the URL has no file name or the extension is
     *                    not accepted for $media_type.
     */
    private function media_download(string $media_type, string $url): array
    {
        // Cut the URL at the first "?" or "#" so neither leaks into the
        // file name or the extension.
        $path = substr($url, 0, strcspn($url, '?#'));
        $segments = explode('/', $path);
        $file_name = array_pop($segments);
        if ($file_name === '') {
            throw new \Exception("URL이 파일명 형식이 아닙니다 : " . $this->getMySocket()->getHost() . $url);
        }

        $file_ext = pathinfo($file_name, PATHINFO_EXTENSION);
        if (!$this->isFileType_FileTrait($file_ext, $media_type)) {
            throw new \Exception("파일명 형식이 {$media_type}가 아닙니다");
        }

        $content = $this->getMySocket()->getContent($url);
        log_message("notice", "{$file_name} 파일이 다운로드되었습니다!");

        return [$file_name, $content];
    }

    /**
     * Downloads and stores every URL in $media_urls.
     *
     * A failure on a single URL is logged as a warning and skipped; the
     * sequence number only advances for files that were stored.
     *
     * @param array $media_urls Map of media type => list of URLs.
     *
     * @return array The storage results collected for this call.
     *
     * @throws \Exception when nothing could be downloaded.
     */
    final protected function media_process(array $media_urls): array
    {
        $file_sequence = 1;
        // Reset for every detail page; the collected list is passed on to
        // backend_process() by main_process().
        $this->_storages = [];
        foreach ($media_urls as $media_type => $urls) {
            foreach ($urls as $url) {
                if ($url === null) {
                    continue;
                }
                try {
                    [$file_name, $content] = $this->media_download($media_type, $url);
                    $this->media_save($file_sequence, $media_type, $file_name, $content);
                    $file_sequence++;
                    log_message("notice", __FUNCTION__ . " OriginType->{$media_type} 작업 완료");
                } catch (\Exception $e) {
                    log_message("warning", sprintf(
                        "\n---%s mediaType->%s 오류---\n%s\n-----------------------------------------\n",
                        __FUNCTION__,
                        $media_type,
                        $e->getMessage()
                    ));
                }
            }
        }

        if (count($this->_storages) === 0) {
            throw new \Exception("Download된 Content가 없습니다.");
        }

        return $this->_storages;
    }

    /**
     * Runs detail_page(), media_process() and backend_process() for the
     * entries in $listInfos.
     *
     * A $max_limit of 0 means "all entries"; otherwise the smaller of
     * $max_limit and the number of entries is used. An entry that throws is
     * logged as a warning and skipped.
     *
     * NOTE(review): the counter $i only advances after an entry succeeded,
     * so the limit caps the number of successfully processed entries and a
     * failed entry shares its log index with the next one. Confirm this is
     * the intended meaning of the limit.
     */
    protected function main_process(int $max_limit, array $listInfos): void
    {
        $total = count($listInfos);
        $max_limit = $max_limit ? min($total, $max_limit) : $total;
        $i = 1;
        foreach ($listInfos as $listInfo) {
            // The counter never decreases, so nothing further can be
            // processed once the limit has been passed.
            if ($i > $max_limit) {
                break;
            }
            try {
                log_message("notice", "게시물 {$i}번째/{$total}개중 {$listInfo["nickname"]} 작업시작");
                // detail_page() may modify the list info (title, author,
                // time, ...), so take the returned copy.
                [$listInfo, $media_urls] = $this->detail_page($listInfo);
                // Download the image / video sources found on the page.
                $this->media_process($media_urls);
                // Register the results (file DB, board DB, ...).
                $this->backend_process($i, $listInfo, $this->_storages);
                log_message("notice", "게시물 {$i}번째/{$total}개중 {$listInfo["nickname"]} 작업완료.");
                $i++;
            } catch (\Exception $e) {
                log_message("warning", sprintf(
                    "\n---%s 오류---\n%s\n-----------------------------------------\n",
                    __FUNCTION__,
                    $e->getMessage()
                ));
            }
        }
    }
}