156 lines
6.7 KiB
PHP
156 lines
6.7 KiB
PHP
<?php
|
|
|
|
namespace App\Libraries\MyCrawler;
|
|
|
|
use App\Libraries\CommonLibrary;
|
|
use Symfony\Component\DomCrawler\Crawler;
|
|
use App\Traits\FileTrait;
|
|
|
|
/**
 * Base class for crawlers.
 *
 * It selects DOM fragments, collects media URLs from them, downloads each
 * media file through the socket supplied by the concrete crawler and hands
 * the content to the storage supplied by the concrete crawler.
 */
abstract class MyCrawler extends CommonLibrary
{
    use FileTrait;

    protected function __construct()
    {
        parent::__construct();
    }

    /**
     * Returns the socket used for downloads.
     *
     * This class calls getHost() and getContent(string $url) on the result.
     */
    abstract protected function getMySocket();

    /**
     * Returns a fresh storage object for one downloaded file.
     *
     * This class calls setOriginName(), setOriginContent(), setOriginType(),
     * setOriginSequence() and save() on the result.
     */
    abstract protected function createMyStorage();

    /**
     * Handles one list entry.
     *
     * @param int   $cnt      Running counter passed in by main_process().
     * @param array $listInfo The list entry; main_process() reads its "nickname" key.
     *
     * @return array The list entry, possibly modified by the implementation.
     */
    abstract protected function detail_page(int $cnt, array $listInfo): array;

    /**
     * Parses $content and returns the nodes matching the CSS selector $tag.
     *
     * The selector is evaluated once and the same result is used for the debug
     * dump and the return value.
     *
     * @param string $content Markup to parse.
     * @param string $tag     CSS selector.
     *
     * @return Crawler The matched nodes (possibly an empty node list).
     */
    final protected function getSelector(string $content, string $tag): Crawler
    {
        $crawler = new Crawler($content);
        $debug = $this->getDebug();
        if ($debug) {
            log_message("debug", __FUNCTION__ . "=> " . $tag);
        }
        $selected = $crawler->filter($tag);
        if ($debug) {
            log_message("debug", sprintf(
                "\n------------%s HTML-------------\n%s\n-----------------------------------------------------\n",
                __FUNCTION__,
                // html() throws on an empty node list; turning on debug output
                // must not change whether this method succeeds.
                $selected->count() > 0 ? $selected->html() : ''
            ));
        }
        return $selected;
    }

    /**
     * Normalises a media URL. This implementation drops the query string.
     *
     * @param string $url URL to normalise.
     *
     * @return string Everything before the first "?"; an empty string when the
     *                URL is empty or starts with "?" (previously that case
     *                returned null and violated the declared return type).
     */
    protected function changeURLByMediaType(string $url): string
    {
        return explode('?', $url, 2)[0];
    }

    /**
     * Collects the URLs of one media type from the given selection.
     *
     * @param string  $media_type Key under which the URLs are stored; "video"
     *                            additionally falls back to the "src" attribute
     *                            of the node's children.
     * @param Crawler $selector   Nodes to search in.
     * @param array   $options    Must contain "tag" (CSS selector) and "attr"
     *                            (attribute holding the URL).
     * @param array   $urls       Previously collected URLs; the entry for
     *                            $media_type is reset before collecting.
     *
     * @return array $urls with $urls[$media_type] filled in.
     */
    protected function getUrlsByMediaType(string $media_type, Crawler $selector, array $options, array $urls = []): array
    {
        $urls[$media_type] = [];
        // __FUNCTION__ evaluates to "{closure}" inside the callback, so capture
        // the method name here for the log line.
        $caller = __FUNCTION__;
        $selector->filter($options["tag"])->each(
            function (Crawler $node) use ($caller, $media_type, $options, &$urls): void {
                $url = $node->attr($options["attr"]);
                if ($url === null && $media_type === 'video') {
                    // attr() throws on an empty node list, which would abort the
                    // whole collection for a single childless node.
                    $children = $node->children();
                    if ($children->count() > 0) {
                        $url = $children->attr("src");
                    }
                }
                if ($url !== null && preg_match('/^[^?]+/', $url, $matches)) {
                    $urls[$media_type][] = $this->changeURLByMediaType($matches[0]);
                    return;
                }
                // No usable URL on this node: record the node for inspection.
                log_message("debug", $caller . "-> {$media_type}[{$options["attr"]}]\n");
                log_message("debug", $node->html());
            }
        );
        log_message("notice", "-----------" . __FUNCTION__ . "=> {$media_type} 작업완료--------");
        return $urls;
    }

    /**
     * Stores one downloaded file through a new storage object.
     *
     * @param int    $file_sequence Position of the file within the current run.
     * @param string $media_type    Media type of the file.
     * @param string $file_name     Original file name.
     * @param string $content       Downloaded content.
     *
     * @return mixed Whatever the storage's save() returns.
     *               NOTE(review): media_process() calls getOriginName() on this
     *               value, so save() presumably returns the storage object — confirm.
     */
    private function media_save(int $file_sequence, string $media_type, string $file_name, string $content): mixed
    {
        log_message("debug", __FUNCTION__ . " 원본파일 {$file_name} 작업 시작");
        $storage = $this->createMyStorage();
        $storage->setOriginName($file_name);
        $storage->setOriginContent($content);
        $storage->setOriginType($media_type);
        $storage->setOriginSequence($file_sequence);
        return $storage->save();
    }

    /**
     * Downloads one image or video file referenced by a view page.
     *
     * @param string $media_type Expected media type; the file extension is
     *                           checked against it with isFileType_FileTrait().
     * @param string $url        URL of the file.
     *
     * @return array [file name, content].
     *
     * @throws \Exception When the URL has no file name or the extension does
     *                    not belong to $media_type.
     */
    private function media_download(string $media_type, string $url): array
    {
        // explode() never yields an empty array, so testing its result cannot
        // detect a missing file name; test the last path segment instead.
        $segments = explode('/', $url);
        $file_name = array_pop($segments);
        if ($file_name === '') {
            throw new \Exception("URL이 파일명 형식이 아닙니다 : " . $this->getMySocket()->getHost() . $url);
        }
        // A name without a dot has no extension (empty string).
        $file_ext = pathinfo($file_name, PATHINFO_EXTENSION);
        if (!$this->isFileType_FileTrait($file_ext, $media_type)) {
            throw new \Exception("파일명 형식이 {$media_type}가 아닙니다");
        }
        $content = $this->getMySocket()->getContent($url);
        log_message("notice", "{$file_name} 파일이 다운로드되었습니다!");
        return [$file_name, $content];
    }

    /**
     * Downloads and stores every URL in $media_urls.
     *
     * A failure on one file is logged as a warning and the remaining files are
     * still processed.
     *
     * @param array $media_urls Map of media type => list of URLs.
     *
     * @return array The save results of the files that succeeded, in
     *               processing order.
     */
    final protected function media_process(array $media_urls): array
    {
        $file_sequence = 1;
        // Rebuilt on every call (one call per detail page); the original note
        // says the result is used when the board entry is created.
        $storages = [];
        // The sequence number runs across all media types, so the total shown
        // next to it must cover all media types as well.
        $total = array_sum(array_map(static fn ($urls): int => count($urls), $media_urls));
        foreach ($media_urls as $media_type => $urls) {
            foreach ($urls as $url) {
                log_message("notice", __FUNCTION__ . " {$file_sequence}번째/총:{$total} MediaType->{$media_type} 작업 시작");
                try {
                    [$file_name, $content] = $this->media_download($media_type, $url);
                    $storages[] = $this->media_save($file_sequence, $media_type, $file_name, $content);
                } catch (\Exception $e) {
                    log_message("warning", sprintf(
                        "\n---%s MediaType->%s {$file_sequence}번째/총:{$total} 오류---\n%s\n-----------------------------------------\n",
                        __FUNCTION__,
                        $media_type,
                        $e->getMessage()
                    ));
                }
                log_message("notice", __FUNCTION__ . " {$file_sequence}번째/총:{$total} MediaType->{$media_type} 작업 완료");
                $file_sequence++;
            }
        }
        $stored = count($storages);
        $position = 1;
        foreach ($storages as $storage) {
            log_message("debug", __FUNCTION__ . " {$position}번째/총:{$stored} 결과=>" . $storage->getOriginName());
            $position++;
        }
        return $storages;
    }

    /**
     * Runs detail_page() for the list entries, up to the given limit.
     *
     * A failure in detail_page() is logged as a warning and processing
     * continues with the next entry.
     *
     * @param int   $max_limit Maximum number of entries to process; 0 means
     *                         all of them. Values above the number of entries
     *                         are capped.
     * @param array $listInfos List entries; each must contain a "nickname" key.
     */
    protected function main_process(int $max_limit, array $listInfos): void
    {
        $total = count($listInfos);
        // Limit 0 => process every entry; otherwise the smaller of the limit
        // and the number of entries.
        $max_limit = $max_limit ? min($total, $max_limit) : $total;
        // In debug mode the counter starts at the limit so that a single entry
        // is processed. The original reset the counter inside the loop, which
        // made the limit check always pass.
        // NOTE(review): assumes "one entry in debug mode" is the intent — confirm.
        $i = $this->getDebug() ? $max_limit : 1;
        foreach ($listInfos as $listInfo) {
            if ($i > $max_limit) {
                break;
            }
            log_message("notice", "게시물 {$i}번째/총:{$total} {$listInfo["nickname"]} 작업시작");
            try {
                // The entry carries title, author, written time and so on, and
                // may be changed by detail_page(), so take the returned value.
                $listInfo = $this->detail_page($i, $listInfo);
            } catch (\Exception $e) {
                log_message("warning", sprintf(
                    "\n---%s {$i}번째/총:{$total} 오류---\n%s\n-----------------------------------------\n",
                    __FUNCTION__,
                    $e->getMessage()
                ));
            }
            log_message("notice", "게시물 {$i}번째/총:{$total} {$listInfo["nickname"]} 작업완료.");
            $i++;
        }
    }
}
|