diff --git a/app/Controllers/CLI/Crawler.php b/app/Controllers/CLI/Crawler.php index cbf4c8d..107e310 100644 --- a/app/Controllers/CLI/Crawler.php +++ b/app/Controllers/CLI/Crawler.php @@ -4,10 +4,8 @@ namespace App\Controllers\CLI; use App\Models\Mangboard\BoardModel; -use App\Libraries\YamapLibrary; use App\Libraries\MyWebLibrary; -use App\Libraries\MyStorage\FileLibrary; -use App\Libraries\MyCrawlerLibrary; +use App\Libraries\MyCrawler\YamapLibrary; use App\Libraries\Mangboard\BoardLibrary; use App\Entities\Mangboard\BoardEntity; use App\Controllers\BaseController; @@ -20,17 +18,9 @@ class Crawler extends BaseController $isDebug = in_array("debug", $params); //1.Yamap사이트에서 자유게시판에서 최근 게시물 데이터 가져오기 if (!in_array("skip_build", $params)) { - $myWeb = new MyWebLibrary(getenv('yamap.host.url')); - $storage = new FileLibrary(WRITEPATH . "uploads"); - $storage->setPath("Yamap"); - $crawler = new MyCrawlerLibrary(); - $yamap = new YamapLibrary(); $yamap->setDebug($isDebug); - $yamap->setMyWeb($myWeb); - $yamap->setMyStorage($storage); - $yamap->setMyCrawler($crawler); - list($title, $nickname, $mediaInfos, $mediaTags) = $yamap->build(); + list($title, $nickname, $mediaInfos, $mediaTags) = $yamap->execute(); } // //2. 사이트 로그인 처리 // if (!in_array("skip_login", $params)) { @@ -58,7 +48,7 @@ class Crawler extends BaseController //망보드에 넣기 $board->create($entity); } - log_message("notice", "Crawler->yapmap 작업이 완료되었습니다."); + log_message("notice", "Crawler->" . __FUNCTION__ . " 작업이 완료되었습니다."); return true; } catch (\Exception $e) { log_message("error", $e->getMessage()); diff --git a/app/Libraries/CommonLibrary.php b/app/Libraries/CommonLibrary.php index 55ba6fb..509b427 100644 --- a/app/Libraries/CommonLibrary.php +++ b/app/Libraries/CommonLibrary.php @@ -15,10 +15,4 @@ abstract class CommonLibrary { $this->_debug = $debug; } - - //url에 http 나 https가 포함되어 있으면 true - final public function isContainsHttpOrHttps($url): bool - { - return strpos($url, 'http://') !== false || strpos($url, 'https://') !== false; - } } diff --git a/app/Libraries/MyCrawler/MyCrawlerLibrary.php b/app/Libraries/MyCrawler/MyCrawlerLibrary.php new file mode 100644 index 0000000..c87d340 --- /dev/null +++ b/app/Libraries/MyCrawler/MyCrawlerLibrary.php @@ -0,0 +1,58 @@ +getMySocket()->getContent($url); + if (!$response) { + throw new \Exception("getCrawler 실패:{$url}"); + } + $crawler = new Crawler($response); + return $crawler->filter($tag); + } + + final protected function getNodes(Crawler $crawler, array $options, $nodes = []): array + { + $crawler->filter($options["tag"])->each( + function (Crawler $node) use (&$options, &$nodes): void { + log_message("debug", sprintf("getNode-> %s", $options["tag"])); + $nodes[] = $node; + } + ); + return $nodes; + } + + final protected function download(Crawler $crawler, array $options): array + { + $downloadInfos = []; + $nodes = $this->getNodes($crawler, $options); + foreach ($nodes as $node) { + $original = $node->attr($options["attr"]); + list($fileName, $content) = $this->getMySocket()->download($original); + $this->getMyStorage()->setFileName($fileName); + if (!$this->getMyStorage()->save($content)) { + continue; + } + $downloadInfos[] = [ + "orignal" => $node->html(), + "path" => $this->getMyStorage()->getPath(), + "fileName" => $fileName, + ]; + } + return $downloadInfos; + } +} diff --git a/app/Libraries/MyCrawler/YamapLibrary.php b/app/Libraries/MyCrawler/YamapLibrary.php new file mode 100644 index 0000000..2d32c33 --- /dev/null +++ b/app/Libraries/MyCrawler/YamapLibrary.php @@ -0,0 +1,125 @@ +_mySocket === null) { + $this->_mySocket = new WebLibrary(getenv('yamap.host.url')); + } + return $this->_mySocket; + } + + public function getMyStorage() + { + if ($this->_myStorage === null) { + $this->_myStorage = new FileLibrary(getenv('yamap.storage.upload.path')); + } + return $this->_myStorage; + } + + private function mainPage(string $url): array + { + $crawler = $this->getContent($url, getenv("yamap.list.tag")); + $item_tag = getenv("yamap.list.item.tag"); + $item_link_tag = getenv("yamap.list.item.link.tag"); + $item_nickname_tag = getenv("yamap.list.item.nickname.tag"); + $item_nickname_except = getenv("yamap.list.item.nickname.except"); + + $lists = []; + //div.bbs_item를 가진 객체를 찾아서 같은 형식의 객체(sibling)를 배열로 넘김 + $crawler->filter($item_tag)->each( + function (Crawler $node) use ( + &$item_link_tag, + &$item_nickname_tag, + &$item_nickname_except, + &$lists + ): void { + //bbs_item에서 span.g_nickname 객체를 찾아서 작성자가 "관리자" 아닌지 확인 후 Return Bool + $nickname = $node->filter($item_nickname_tag)->text(); + log_message("debug", $item_nickname_tag . ":" . $nickname); + if ($nickname != $item_nickname_except) { + //작성자가 "관리자"가 아니 게시물이면 해당 bbs_item에서 a.list_subject 객체를 찾아서 + $link_node = $node->filter($item_link_tag); + $url = $link_node->attr("href"); + $title = $link_node->children()->last()->text(); + $lists[] = ['title' => $title, 'nickname' => $nickname, 'url' => $url]; + } + } + ); + if (!count($lists)) { + throw new \Exception("Target URL이 없습니다."); + } + return array($lists[0]["title"], $lists[0]["nickname"], $lists[0]["url"]); + } + + private function detailPage(string $url): array + { + $crawler = $this->getContent($url, getenv("yamap.view.content.tag")); + $mediaInfos = []; + $mediaTags = []; + //3. Image 처리 + $downloadInfos = $this->download($crawler, ["tag" => "img", "attr" => "src"]); + foreach ($downloadInfos as $downloadInfo) { + if ($this->getMySocket()->isContainsHttpOrHttps($downloadInfo['orignal'])) { + $mediaTags[] = $downloadInfos['orignal']; + } else { + $mediaTags[] = sprintf( + "\"%s\"", + $this->getMyStorage()->getUploadPath(), + $downloadInfo["path"], + $downloadInfo["fileName"], + $downloadInfo["fileName"] + ); + }; + $mediaInfos[] = $downloadInfo; + } + //4. Video(mp4) 처리 + $downloadInfos = $this->download($crawler, ["tag" => "video", "attr" => "src"]); + foreach ($downloadInfos as $downloadInfo) { + if ($this->getMySocket()->isContainsHttpOrHttps($downloadInfo['orignal'])) { + $mediaTags[] = $downloadInfos['orignal']; + } else { + $mediaTags[] = sprintf( + "