diff --git a/app/Config/Routes.php b/app/Config/Routes.php
index 22ad419..0829700 100644
--- a/app/Config/Routes.php
+++ b/app/Config/Routes.php
@@ -34,5 +34,8 @@ $routes->group('mangboard', ['namespace' => 'App\Controllers\Mangboard'], functi
         $routes->cli('yamap/(:alpha)', 'CrawlerController::yamap/$1');
         $routes->cli('yamap/(:alpha)/(:any)', 'CrawlerController::yamap/$1/$2');
         $routes->cli('yamap/(:alpha)/(:alphanum)/(:any)', 'CrawlerController::yamap/$1/$2/$3');
+        $routes->cli('yamoon/(:alpha)', 'CrawlerController::yamoon/$1');
+        $routes->cli('yamoon/(:alpha)/(:any)', 'CrawlerController::yamoon/$1/$2');
+        $routes->cli('yamoon/(:alpha)/(:alphanum)/(:any)', 'CrawlerController::yamoon/$1/$2/$3');
     });
 });
diff --git a/app/Controllers/Mangboard/CrawlerController.php b/app/Controllers/Mangboard/CrawlerController.php
index 409f7bc..1a05ece 100644
--- a/app/Controllers/Mangboard/CrawlerController.php
+++ b/app/Controllers/Mangboard/CrawlerController.php
@@ -2,9 +2,10 @@
 
 namespace App\Controllers\Mangboard;
 
+use App\Libraries\MyCrawler\YamoonCrawler;
 use App\Libraries\MyCrawler\YamapCrawler;
-use App\Controllers\CommonController;
 use App\Libraries\Mangboard\UserLibrary;
+use App\Controllers\CommonController;
 
 class CrawlerController extends CommonController
 {
@@ -27,4 +28,40 @@ class CrawlerController extends CommonController
             return $e->getMessage();
         }
     }
+    /**
+     * CLI entry point: logs in to the Mangboard site and runs the Yamoon crawler for one category.
+     *
+     * @param string $category Category handed to YamoonCrawler.
+     * @param string $id       Login id; an empty string selects "mangboard.login.default.id" from the environment.
+     * @param string $debug    Only the exact string "true" enables crawler debug mode.
+     *
+     * @return string The completion message, or the exception message when the run fails.
+     */
+    public function yamoon(string $category, string $id = "", string $debug = "false"): string
+    {
+        try {
+            $host = getenv("mangboard.host.url");
+            $password = getenv("mangboard.login.default.password");
+            if ($id === "") {
+                $id = getenv("mangboard.login.default.id");
+            }
+            // getenv() yields false for an unset variable; stop here with a clear message
+            // rather than passing false on to login().
+            if ($host === false || $password === false || $id === false) {
+                throw new \Exception("Mangboard 접속정보(host/id/password) 환경변수가 정의되지 않았습니다.");
+            }
+            // 1. Log in to the site.
+            $user_library = new UserLibrary();
+            $user_entity = $user_library->login($host, $id, $password);
+            // 2. Hand the logged-in user to the crawler, then run it.
+            $crawler = new YamoonCrawler($category);
+            $crawler->setUserEntity($user_entity);
+            $crawler->setDebug($debug === "true");
+            $crawler->execute();
+            return "완료되었습니다.";
+        } catch (\Exception $e) {
+            log_message("error", $e->getMessage());
+            return $e->getMessage();
+        }
+    }
 }
diff --git a/app/Libraries/MyCrawler/YamapCrawler.php b/app/Libraries/MyCrawler/YamapCrawler.php
index fc57ba0..70b4ba7 100644
--- a/app/Libraries/MyCrawler/YamapCrawler.php
+++ b/app/Libraries/MyCrawler/YamapCrawler.php
@@ -135,19 +135,25 @@ class YamapCrawler extends MyCrawlerLibrary
     {
         $file_sequence = 1;
         $this->_storages = []; //CreateBoard에서 사용을 위해 DetailPage마다 초기화
-        foreach ($urls as $mediaType => $url) {
-            try {
-                list($file_name, $content) = $this->download($mediaType, $url);
-                $this->save($file_sequence, $mediaType, $file_name, $content);
-                $file_sequence++;
-                log_message("notice", __FUNCTION__ . " OriginType->{$mediaType} 작업 완료");
-            } catch (\Exception $e) {
-                log_message("warning", sprintf(
-                    "\n---%s,OriginType->%s 오류---\n%s\n-----------------------------------------\n",
-                    __FUNCTION__,
-                    $mediaType,
-                    $e->getMessage()
-                ));
+        // log_message("debug", var_export($urls, true));
+        foreach ($urls as $mediaType => $media_urls) {
+            foreach ($media_urls as $url) {
+                try {
+                    if ($url === null) {
+                        continue;
+                    }
+                    list($file_name, $content) = $this->download($mediaType, $url);
+                    $this->save($file_sequence, $mediaType, $file_name, $content);
+                    $file_sequence++;
+                    log_message("notice", __FUNCTION__ . " OriginType->{$mediaType} 작업 완료");
+                } catch (\Exception $e) {
+                    log_message("warning", sprintf(
+                        "\n---%s,OriginType->%s 오류---\n%s\n-----------------------------------------\n",
+                        __FUNCTION__,
+                        $mediaType,
+                        $e->getMessage()
+                    ));
+                }
             }
         }
         if (!count($this->_storages)) {
@@ -157,26 +163,33 @@ class YamapCrawler extends MyCrawlerLibrary
     //Yamap ViewPage의 이미지나영상데이터가 있으면 URL과MediaType을 가져온다
     private function getUrlsByDetailPageMediaType(string $mediaType, Crawler $selector, array $options, array $urls = []): array
     {
+        $urls[$mediaType] = [];
         $selector->filter($options["tag"])->each(
             function (Crawler $node) use (&$mediaType, &$options, &$urls): void {
-                log_message("debug", sprintf(
-                    "getNode->%s[%s]",
-                    $options["tag"],
-                    $node->attr($options['attr'])
-                ));
-                $urls[$mediaType] = $node->attr($options["attr"]);
+                $url = $node->attr($options["attr"]);
+                log_message("debug", "getUrlsByDetailPageMediaType-> {$mediaType}[{$options["attr"]}]:{$url}");
+                if (!is_null($url)) {
+                    $urls[$mediaType][] = $url;
+                }
             }
         );
         return $urls;
     }
     private function detailPage(array $listInfo): array
     {
+        // Presumably a sample of the detail-page markup. The HTML tags were lost from this
+        // note; the surviving text was, in order:
+        //   "요즘 패션" / "괴강고귀" / "| 추천 (14) | 조회 (432)" / "2024-09-14 01:53:45"
+        //   (looks like title, nickname, recommend/view counts and post time; verify against the live page)
         $response = $this->getMySocket()->getContent($listInfo['detail_url']);
-        $selector = $this->getSelector($response, getenv("yamap.view.content.tag"));
+        $tag = getenv("yamap.view.content.tag");
+        $selector = $this->getSelector($response, $tag);
+        log_message("debug", "\n-----------detailPage Tag: {$tag}---------------\n{$selector->html()}\n---------------------------\n");
         $urls = $this->getUrlsByDetailPageMediaType("image", $selector, ["tag" => "img", "attr" => "src"]);
         $urls = $this->getUrlsByDetailPageMediaType("video", $selector, ["tag" => "video", "attr" => "src"], $urls);
-        log_message("notice", sprintf("\n-----------%s 작업완료--------\n%s\n-----------------------\n", __FUNCTION__, var_export($urls, true)));
-        return $urls;
+        log_message("debug", "\n-------------------------\n" . var_export($urls, true) . "\n-----------------------\n");
+        log_message("notice", "-----------" . __FUNCTION__ . " 작업완료--------");
+        return array($listInfo, $urls);
     }
     private function mainPage(string $url): array
     {
@@ -226,17 +239,19 @@ class YamapCrawler extends MyCrawlerLibrary
         } else {
             $max_limit = count($listInfos);
         }
+        $total = count($listInfos);
         $i = 1;
         foreach ($listInfos as $listInfo) {
             if ($i <= $max_limit) {
                 try {
-                    log_message("notice", "게시물 {$i}번째 {$listInfo["nickname"]} 작업시작");
-                    $this->mediaContent($this->detailPage($listInfo));
+                    log_message("notice", "게시물 {$i}번째/{$total}개중 {$listInfo["nickname"]} 작업시작");
+                    list($listInfo, $urls) = $this->detailPage($listInfo);
+                    $this->mediaContent($urls);
                     //File DB 및 Board DB 등록작업
                     $board_entity = $this->getBoardLibrary()->createByCrawler($i, $listInfo, $this->_storages);
                     $this->getFileLibrary()->createByCrawler($board_entity, $this->_storages);
                     $this->getImageLibrary()->createByCrawler($board_entity, $this->_storages);
-                    log_message("notice", "게시물 {$i}번째 {$listInfo["nickname"]} 작업완료.");
+                    log_message("notice", "게시물 {$i}번째/{$total}개중 {$listInfo["nickname"]} 작업완료.");
                     $i++;
                 } catch (\Exception $e) {
                     log_message("debug", $e->getMessage());
diff --git a/app/Libraries/MyCrawler/YamoonCrawler.php b/app/Libraries/MyCrawler/YamoonCrawler.php
new file mode 100644
index 0000000..13130ad
--- /dev/null
+++ b/app/Libraries/MyCrawler/YamoonCrawler.php
@@ -0,0 +1,334 @@
+        // NOTE(review): everything ahead of this statement (the PHP open tag, namespace, use lines,
+        // class declaration, properties and constructor signature) is absent from the patch as
+        // reviewed; restore it from the original commit before applying.
+        $this->_category = $category;
+    }
+    /**
+     * Returns the socket used to fetch list pages, detail pages and media, creating it on first use
+     * from the "yamoon.host.url" environment value.
+     */
+    public function getMySocket()
+    {
+        if ($this->_mySocket === null) {
+            $this->_mySocket = new WebSocket(getenv('yamoon.host.url'));
+        }
+        return $this->_mySocket;
+    }
+    /**
+     * Returns the storage used by save(), creating it on first use for the current category.
+     */
+    public function getMyStorage()
+    {
+        if ($this->_myStorage === null) {
+            $this->_myStorage = new MangboardStorage($this->getCategory());
+        }
+        return $this->_myStorage;
+    }
+    /**
+     * Returns the BoardsLibrary for the current category and user, creating it on first use.
+     *
+     * @throws \Exception When the category or the user entity has not been set.
+     */
+    public function getBoardsLibrary(): BoardsLibrary
+    {
+        if ($this->_boards_library === null) {
+            $this->_boards_library = new BoardsLibrary(
+                $this->getCategory(),
+                $this->getUserEntity()
+            );
+        }
+        return $this->_boards_library;
+    }
+    /**
+     * Returns the BoardLibrary bound to the boards entity and user, creating it on first use.
+     */
+    public function getBoardLibrary(): BoardLibrary
+    {
+        if ($this->_board_library === null) {
+            $this->_board_library = new BoardLibrary(
+                $this->getBoardsLibrary()->getEntity(),
+                $this->getUserEntity()
+            );
+        }
+        return $this->_board_library;
+    }
+    /**
+     * Returns the FileLibrary bound to the boards entity and user, creating it on first use.
+     */
+    public function getFileLibrary(): FileLibrary
+    {
+        if ($this->_file_library === null) {
+            $this->_file_library = new FileLibrary(
+                $this->getBoardsLibrary()->getEntity(),
+                $this->getUserEntity()
+            );
+        }
+        return $this->_file_library;
+    }
+    /**
+     * Returns the ImageLibrary, creating it on first use.
+     */
+    public function getImageLibrary(): ImageLibrary
+    {
+        if ($this->_image_library === null) {
+            $this->_image_library = new ImageLibrary();
+        }
+        return $this->_image_library;
+    }
+    /**
+     * Returns the logged-in user supplied through setUserEntity().
+     *
+     * @throws \Exception When no user entity has been set.
+     */
+    public function getUserEntity(): UserEntity
+    {
+        if ($this->_user_entity === null) {
+            throw new \Exception("사용자정보가 없습니다.");
+        }
+        return $this->_user_entity;
+    }
+    /**
+     * Sets the logged-in user that the board and file libraries are created with.
+     */
+    public function setUserEntity(UserEntity $user_entity): void
+    {
+        $this->_user_entity = $user_entity;
+    }
+    /**
+     * Returns the category this crawler stores into.
+     *
+     * @throws \Exception When the category is empty.
+     */
+    public function getCategory(): string
+    {
+        if ($this->_category == "") {
+            throw new \Exception("저장할 Category가 정의되지 않았습니다.");
+        }
+        return $this->_category;
+    }
+    /**
+     * Passes one downloaded file to the storage object and appends the save() result to $_storages.
+     *
+     * @param int    $file_sequence Running number of the file within the current post.
+     * @param string $mediaType     Media type key ("image" or "video" in this class).
+     * @param string $file_name     Original file name.
+     * @param string $content       Downloaded file content.
+     */
+    private function save(int $file_sequence, string $mediaType, string $file_name, string $content): void
+    {
+        log_message("debug", __FUNCTION__ . " 원본파일 {$file_name} 작업 시작");
+        $storage = $this->getMyStorage();
+        $storage->setOriginName($file_name);
+        $storage->setOriginContent($content);
+        $storage->setOriginType($mediaType);
+        $storage->setOriginSequence($file_sequence);
+        $this->_storages[] = $storage->save();
+    }
+    /**
+     * Downloads one image or video referenced by a detail page.
+     *
+     * The file name is the last segment of the URL path, so a query string or fragment
+     * after the name is not mistaken for part of the name or its extension.
+     *
+     * @param string $mediaType Media type key checked against the file extension.
+     * @param string $url       Media URL taken from the page.
+     *
+     * @return array Two elements: the file name and the downloaded content.
+     *
+     * @throws \Exception When the URL has no file name or the extension does not match $mediaType.
+     */
+    private function download(string $mediaType, string $url): array
+    {
+        $path = parse_url($url, PHP_URL_PATH);
+        $segments = is_string($path) ? explode("/", $path) : [];
+        $file_name = (string) array_pop($segments);
+        if ($file_name === "") {
+            throw new \Exception("URL이 파일명 형식이 아닙니다 : " . $this->getMySocket()->getHost() . $url);
+        }
+        // No dot means no extension; the whole name must not be treated as one.
+        $dot = strrpos($file_name, ".");
+        $file_ext = $dot === false ? "" : substr($file_name, $dot + 1);
+        if (!$this->isFileType_FileTrait($file_ext, $mediaType)) {
+            throw new \Exception("파일명 형식이 {$mediaType}가 아닙니다");
+        }
+        $content = $this->getMySocket()->getContent($url);
+        log_message("notice", "{$file_name} 파일이 다운로드되었습니다!");
+        return [$file_name, $content];
+    }
+    /**
+     * Downloads and stores every URL collected for one post.
+     *
+     * A failure on a single URL is logged as a warning and skipped so the remaining files are still tried.
+     *
+     * @param array $urls Map of media type to list of URLs, as built by detailPage().
+     *
+     * @throws \Exception When nothing at all could be stored.
+     */
+    private function mediaContent(array $urls): void
+    {
+        $file_sequence = 1;
+        // Reset per detail page; execute() hands this list to the board/file/image libraries.
+        $this->_storages = [];
+        foreach ($urls as $mediaType => $media_urls) {
+            foreach ($media_urls as $url) {
+                try {
+                    [$file_name, $content] = $this->download($mediaType, $url);
+                    $this->save($file_sequence, $mediaType, $file_name, $content);
+                    $file_sequence++;
+                    log_message("notice", __FUNCTION__ . " OriginType->{$mediaType} 작업 완료");
+                } catch (\Exception $e) {
+                    log_message("warning", sprintf(
+                        "\n---%s,OriginType->%s 오류---\n%s\n-----------------------------------------\n",
+                        __FUNCTION__,
+                        $mediaType,
+                        $e->getMessage()
+                    ));
+                }
+            }
+        }
+        if (!count($this->_storages)) {
+            throw new \Exception("Download된 Content가 없습니다.");
+        }
+    }
+    /**
+     * Collects the given attribute from every matching tag under $selector.
+     *
+     * @param string  $mediaType Key under which the URLs are stored.
+     * @param Crawler $selector  Node set to search in.
+     * @param array   $options   Requires "tag" (selector to filter by) and "attr" (attribute to read).
+     * @param array   $urls      Result of an earlier call to extend; its $mediaType entry is reset.
+     *
+     * @return array $urls with $urls[$mediaType] set to the list of non-null attribute values.
+     */
+    private function getUrlsByDetailPageMediaType(string $mediaType, Crawler $selector, array $options, array $urls = []): array
+    {
+        $urls[$mediaType] = [];
+        $selector->filter($options["tag"])->each(
+            // Only $urls is written inside the callback, so it alone is captured by reference.
+            function (Crawler $node) use ($mediaType, $options, &$urls): void {
+                $url = $node->attr($options["attr"]);
+                log_message("debug", "getUrlsByDetailPageMediaType-> {$mediaType}[{$options["attr"]}]:{$url}");
+                if ($url !== null) {
+                    $urls[$mediaType][] = $url;
+                }
+            }
+        );
+        return $urls;
+    }
+    /**
+     * Fetches one post's detail page and collects its image and video URLs.
+     *
+     * @param array $listInfo Post entry; "detail_url" is appended to "/newboard/yamoonboard/".
+     *
+     * @return array Two elements: $listInfo (unchanged) and the URL map for mediaContent().
+     */
+    private function detailPage(array $listInfo): array
+    {
+        $url = "/newboard/yamoonboard/" . $listInfo['detail_url'];
+        $response = $this->getMySocket()->getContent($url);
+        // The content area is selected by the "yamoon.view.content.tag" environment value.
+        $tag = getenv("yamoon.view.content.tag");
+        $selector = $this->getSelector($response, $tag);
+        log_message("debug", "\n-----------detailPage Tag: {$tag}---------------\n{$selector->html()}\n---------------------------\n");
+        $urls = $this->getUrlsByDetailPageMediaType("image", $selector, ["tag" => "img", "attr" => "src"]);
+        $urls = $this->getUrlsByDetailPageMediaType("video", $selector, ["tag" => "video", "attr" => "src"], $urls);
+        log_message("debug", "\n-------------------------\n" . var_export($urls, true) . "\n-----------------------\n");
+        log_message("notice", "-----------" . __FUNCTION__ . " 작업완료--------");
+        return [$listInfo, $urls];
+    }
+    /**
+     * Reads the list page and returns the posts dated today.
+     *
+     * Each row's info text is split on "|"; a sample row reads
+     * "yeeyuu | 6 | 369 | No 89372 | 2024-09-13", of which field 0 is used as nickname,
+     * field 2 as hit count and field 4 as date.
+     *
+     * @param string $url List page URL.
+     *
+     * @return array List of entries with the keys title, detail_url, nickname, hit and date.
+     *
+     * @throws \Exception When no post dated today is found.
+     */
+    private function mainPage(string $url): array
+    {
+        $listInfos = [];
+        $response = $this->getMySocket()->getContent($url);
+        $today = date("Y-m-d");
+        $this->getSelector($response, getenv("yamoon.list.tag"))->each(
+            function (Crawler $node) use (&$listInfos, $today): void {
+                $link_node = $node->filter(getenv("yamoon.list.item.link.tag"));
+                $info_node = $node->filter(getenv("yamoon.list.item.info.tag"));
+                // Reading attr()/text() from an empty node set throws (Symfony DomCrawler behaviour,
+                // assuming that is the Crawler in use), which would abort the whole listing;
+                // skip rows that lack either element.
+                if ($link_node->count() === 0 || $info_node->count() === 0) {
+                    return;
+                }
+                $infos = array_map("trim", explode("|", $info_node->text()));
+                // Rows with fewer than five fields cannot carry a date in field 4.
+                if (count($infos) < 5 || $infos[4] !== $today) {
+                    return;
+                }
+                $listInfos[] = [
+                    'title' => $link_node->text(),
+                    'detail_url' => $link_node->attr("href"),
+                    'nickname' => $infos[0],
+                    'hit' => $infos[2],
+                    'date' => $infos[4],
+                ];
+            }
+        );
+        if (!count($listInfos)) {
+            throw new \Exception("Target URL이 없습니다.");
+        }
+        log_message("notice", __FUNCTION__ . " 작업 완료");
+        return $listInfos;
+    }
+    /**
+     * Crawls the category: resolves the post list, then downloads and registers each post.
+     *
+     * In debug mode a single test post built from the "yamoon.view.test.*" environment values
+     * replaces the list page. "yamoon.list.max_limit" caps the number of registered posts;
+     * 0 or an unset value means no cap.
+     */
+    public function execute(): void
+    {
+        if ($this->getDebug()) {
+            // Wrapped in a list: the loop below iterates over post entries, so a bare
+            // associative array would be walked field by field.
+            $listInfos = [[
+                'title' => getenv("yamoon.view.test.title"),
+                'nickname' => getenv("yamoon.view.test.nickname"),
+                'detail_url' => getenv("yamoon.view.test.url"),
+                'time' => date("Y-m-d H:i:s"),
+                'hit' => 1,
+            ]];
+        } else {
+            $listInfos = $this->mainPage(getenv("yamoon.list.url." . $this->getCategory()));
+        }
+        $total = count($listInfos);
+        $max_limit = intval(getenv("yamoon.list.max_limit"));
+        $max_limit = $max_limit !== 0 ? min($total, $max_limit) : $total;
+        $i = 1;
+        foreach ($listInfos as $listInfo) {
+            if ($i > $max_limit) {
+                break;
+            }
+            try {
+                log_message("notice", "게시물 {$i}번째/{$total}개중 {$listInfo["nickname"]} 작업시작");
+                [$listInfo, $urls] = $this->detailPage($listInfo);
+                $this->mediaContent($urls);
+                // Register the post in the board DB and its files in the file DB.
+                $board_entity = $this->getBoardLibrary()->createByCrawler($i, $listInfo, $this->_storages);
+                $this->getFileLibrary()->createByCrawler($board_entity, $this->_storages);
+                $this->getImageLibrary()->createByCrawler($board_entity, $this->_storages);
+                log_message("notice", "게시물 {$i}번째/{$total}개중 {$listInfo["nickname"]} 작업완료.");
+                $i++;
+            } catch (\Exception $e) {
+                // $i only advances on success, so a failed post does not use up the limit.
+                log_message("warning", $e->getMessage());
+            }
+        }
+        log_message("notice", "Crawler->" . __FUNCTION__ . " 작업이 완료되었습니다.");
+    }
+}