diff --git a/app/Config/Routes.php b/app/Config/Routes.php index 928b544..8d461fc 100644 --- a/app/Config/Routes.php +++ b/app/Config/Routes.php @@ -23,6 +23,7 @@ $routes->group('/user', function ($routes) { $routes->group('cli', ['namespace' => 'App\Controllers\CLI'], function ($routes) { $routes->cli('mangboard/level', 'Mangboard::level'); $routes->cli('crawler/yamap', 'Crawler::yamap'); + $routes->cli('crawler/yamap/(:any)', 'Crawler::yamap/$1'); }); $routes->group('admin', ['namespace' => 'App\Controllers\Admin', 'filter' => 'authFilter:manager'], function ($routes) { diff --git a/app/Controllers/CLI/Crawler.php b/app/Controllers/CLI/Crawler.php index b4ae844..d3c79e5 100644 --- a/app/Controllers/CLI/Crawler.php +++ b/app/Controllers/CLI/Crawler.php @@ -3,32 +3,26 @@ namespace App\Controllers\CLI; use App\Controllers\BaseController; -use App\Libraries\MyCrawler\YamapLibrary; +use App\Libraries\YamapLibrary; class Crawler extends BaseController { - public function yamap() + public function yamap(...$params) { try { - $library = new YamapLibrary("https://www.yamap16.com"); - $mainPage = $library->getContent("/Board/List.aspx?id=free"); - $links = $library->getLinks($mainPage); - - //Image형식이나 , Viedeo형식의 Content를 가지고 있으면 - log_message("debug", "viewLink-> " . $links[0]["href"]); - $viewPage = $library->getContent($links[0]["href"]); - // $viewPage = $library->getContent("/Board/View.aspx?id=free&ca=&rno=193046&page=1"); //Image - // $viewPage = $library->getContent("/Board/View.aspx?id=free&ca=&rno=193055&page=1"); //Video - $images = $library->getImages($viewPage); - foreach ($images as $image) { - log_message("debug", "Image-> " . $image['src']); - $library->download($image['src']); - } - $videos = $library->getVideos($viewPage); - foreach ($videos as $video) { - log_message("debug", "Video-> " . $video['src']); - $library->download($video['src']); - } + $isDebug = in_array("debug", $params); + $library = new YamapLibrary(getenv("crawler.yamap.host")); + $library->setDebug($isDebug); + //1. MainPage + $url = getenv("crawler.yamap.url.main"); + $links = $library->getLinksByMainPage($url); + //2. TargetPage : div.contents 가진 객체를 찾아서 첫번쨰 요소에서만 참조 + $url = !in_array("debug", $params) ? getenv("crawler.yamap.url.target") : $links[0]["href"]; + $crawler = $library->getCrawlerByDetailPage($url); + //3. Image + $library->getImages($crawler); + //4. Video + $library->getVideos($crawler); log_message("info", "완료되었습니다."); return true; } catch (\Exception $e) { diff --git a/app/Libraries/MyBaseLibrary.php b/app/Libraries/MyBaseLibrary.php deleted file mode 100644 index 6c131e3..0000000 --- a/app/Libraries/MyBaseLibrary.php +++ /dev/null @@ -1,99 +0,0 @@ -_host = $host; - } - - final public function getHost(): string - { - return $this->_host; - } - - final protected function getClient(): Client - { - if ($this->_client === null) { - $this->_client = new Client(['verify' => false]); - } - return $this->_client; - } - - final protected function getCookieJar() - { - if ($this->_cookieJar === null) { - $this->_cookieJar = new CookieJar(); - } - return $this->_cookieJar; - } - - final public function getContent(string $url): string - { - return $this->getClient()->get($this->gethost() . $url)->getBody(); - } - - // 로그인 메서드 - final public function login($url, $username, $password) - { - try { - $response = $this->getClient()->post($this->gethost() . $url, [ - 'form_params' => [ - 'username' => $username, - 'password' => $password, - ], - 'cookies' => $this->getCookieJar(), - ]); - if ($response->getStatusCode() == 200) { - log_message("info", "로그인 성공!"); - return true; - } else { - log_message("info", "로그인 실패: " . $response->getStatusCode()); - return false; - } - } catch (\Exception $e) { - log_message("error", "파일 다운로드 중 오류 발생: " . $e->getMessage()); - return false; - } - } - - // 파일 다운로드 메서드 - final public function download($url, $path = false) - { - try { - $fileNames = explode('/', $url); - if (!is_array($fileNames) || !count($fileNames)) { - throw new \Exception("Download URL Error:" . $url); - } - $storagePath = WRITEPATH . "uploads"; - $storagePath .= !$path ? '' : DIRECTORY_SEPARATOR . $path; - if (!is_dir($storagePath)) { - if (!mkdir($storagePath)) { - throw new \Exception("Make Directory Error:" . $storagePath); - } - } - $response = $this->getClient()->get($this->gethost() . $url, [ - 'cookies' => $this->getCookieJar(), - 'sink' => $storagePath . DIRECTORY_SEPARATOR . array_pop($fileNames), - ]); - if ($response->getStatusCode() == 200) { - log_message("info", "파일이 성공적으로 다운로드되었습니다!"); - return true; - } else { - log_message("info", "파일 다운로드 실패: " . $response->getStatusCode()); - return false; - } - } catch (\Exception $e) { - log_message("error", "파일 다운로드 중 오류 발생: " . $e->getMessage()); - return false; - } - } -} diff --git a/app/Libraries/MyCrawler/MyCrawlerLibrary.php b/app/Libraries/MyCrawler/MyCrawlerLibrary.php index 9a23b41..dd8b86b 100644 --- a/app/Libraries/MyCrawler/MyCrawlerLibrary.php +++ b/app/Libraries/MyCrawler/MyCrawlerLibrary.php @@ -2,41 +2,78 @@ namespace App\Libraries\MyCrawler; -use App\Libraries\MyBaseLibrary; use Symfony\Component\DomCrawler\Crawler; -abstract class MyCrawlerLibrary extends MyBaseLibrary +class MyCrawlerLibrary { - protected function __construct(string $host) + private $_debug = false; + + public function __construct() {} + + final public function getDebug(): bool { - parent::__construct($host); + return $this->_debug; + } + final public function setDebug(bool $debug): void + { + $this->_debug = $debug; } - final public function getCrawler($html) + final public function createCrawler($html) { return new Crawler($html); } final public function getInnerHTML(string $html, $tag = false) { - return $tag ? $this->getCrawler($html)->filter($tag)->html() : $this->getCrawler($html)->html(); + return $tag ? $this->createCrawler($html)->filter($tag)->html() : $this->createCrawler($html)->html(); } - public function getLinks(string $html, array $options = ["tag" => "a", "attr" => "href"]): array + public function getLinks(Crawler $crawler, array $options = ["tag" => "a", "attr" => "href"]): array { - return $this->getCrawler($html)->filter($options["tag"])->each( + $links = $crawler->filter($options["tag"])->each( function (Crawler $node) use (&$options) { - return ["anchor" => $node->text(), "href" => $node->attr($options["attr"])]; + return [ + "anchor" => $node->text(), + "href" => $node->attr($options["attr"]) + ]; } ); + foreach ($links as $link) { + log_message("debug", "Link-> " . $link['href']); + } + return $links; } - public function getImages(string $html, array $options = ["tag" => "img", "attr" => "src"]): array + public function getImages(Crawler $crawler, array $options = ["tag" => "img", "attr" => "src"]): array { - return $this->getCrawler($html)->filter($options["tag"])->each( + $images = $crawler->filter($options["tag"])->each( function (Crawler $node) use (&$options) { - return ["alt" => $node->text(), "src" => $node->attr($options["attr"])]; + return [ + "alt" => $node->attr('alt'), + "src" => $node->attr($options["attr"]) + ]; } ); + foreach ($images as $image) { + log_message("debug", "Image-> " . $image['src']); + } + return $images; + } + + public function getVideos(Crawler $crawler, array $options = ["tag" => "video", "attr" => "src"]): array + { + $videos = $crawler->filter($options["tag"])->each( + function (Crawler $node) use (&$options) { + return [ + "alt" => $node->attr('alt'), + "src" => $node->attr($options["attr"]) + ]; + } + ); + foreach ($videos as $video) { + log_message("debug", "Video-> " . $video['src']); + } + return $videos; } } diff --git a/app/Libraries/MyCrawler/YamapLibrary.php b/app/Libraries/MyCrawler/YamapLibrary.php deleted file mode 100644 index a8e0bd3..0000000 --- a/app/Libraries/MyCrawler/YamapLibrary.php +++ /dev/null @@ -1,55 +0,0 @@ - "관리자"]): array - { - //div.bbs_item를 가진 객체를 찾아서 배열로 넘김 - $domElements = $this->getCrawler($html)->filter("div.bbs_list div.bbs_item")->first()->siblings(); - $links = []; - foreach ($domElements as $domElement) { - $this->getCrawler($domElement)->filter("span.g_nickname")->each(function (Crawler $node) use (&$options, &$links, &$domElement) { - if ($node->text() != $options["skip"]) { - $links[] = ["anchor" => $node->text(), "href" => $this->getCrawler($domElement)->filter("a.list_subject")->attr("href")]; - } - }); - } - return $links; - } - - public function getImages(string $html, array $options = ["tag" => "img", "attr" => "src"]): array - { - //div.contents 가진 객체를 찾아서 첫번쨰 요소에서만 참조 - $domElement = $this->getCrawler($html)->filter("div.contents")->first(); - return $domElement->filter($options["tag"])->each( - function (Crawler $node) use (&$options) { - return [ - "alt" => $node->attr('alt'), - "src" => $node->attr($options["attr"]) - ]; - } - ); - } - public function getVideos(string $html, array $options = ["tag" => "video", "attr" => "src"]): array - { - //div.contents 가진 객체를 찾아서 첫번쨰 요소에서만 참조 - $domElement = $this->getCrawler($html)->filter("div.contents")->first(); - return $domElement->filter($options["tag"])->each( - function (Crawler $node) use (&$options) { - return [ - "alt" => $node->attr('alt'), - "src" => $node->attr($options["attr"]) - ]; - } - ); - } -} diff --git a/app/Libraries/MyStorage/MyStorageLibrary.php b/app/Libraries/MyStorage/MyStorageLibrary.php new file mode 100644 index 0000000..83f7892 --- /dev/null +++ b/app/Libraries/MyStorage/MyStorageLibrary.php @@ -0,0 +1,28 @@ +_path; + } + final public function setPath(string $path): void + { + $this->_path .= DIRECTORY_SEPARATOR . $path; + } + + final public function getDebug(): bool + { + return $this->_debug; + } + final public function setDebug(bool $debug): void + { + $this->_debug = $debug; + } +} diff --git a/app/Libraries/MyWeb/MyWebLibrary.php b/app/Libraries/MyWeb/MyWebLibrary.php new file mode 100644 index 0000000..d41460e --- /dev/null +++ b/app/Libraries/MyWeb/MyWebLibrary.php @@ -0,0 +1,78 @@ +_host = $host; + } + + final public function getHost(): string + { + return $this->_host; + } + + final public function getClient(): Client + { + if ($this->_client === null) { + $this->_client = new Client(['verify' => false]); + } + return $this->_client; + } + + final public function getCookieJar() + { + if ($this->_cookieJar === null) { + $this->_cookieJar = new CookieJar(); + } + return $this->_cookieJar; + } + + final public function getDebug(): bool + { + return $this->_debug; + } + final public function setDebug(bool $debug): void + { + $this->_debug = $debug; + } + + + final public function getContent(string $url): string + { + return $this->getClient()->get($this->gethost() . $url)->getBody(); + } + + // 로그인 메서드 + final public function login($url, $username, $password) + { + try { + $response = $this->getClient()->post($this->gethost() . $url, [ + 'form_params' => [ + 'username' => $username, + 'password' => $password, + ], + 'cookies' => $this->getCookieJar(), + ]); + if ($response->getStatusCode() == 200) { + log_message("info", "로그인 성공!"); + return true; + } else { + log_message("info", "로그인 실패: " . $response->getStatusCode()); + return false; + } + } catch (\Exception $e) { + log_message("error", "로그인 중 오류 발생: " . $e->getMessage()); + return false; + } + } +} diff --git a/app/Libraries/YamapLibrary.php b/app/Libraries/YamapLibrary.php new file mode 100644 index 0000000..e254ece --- /dev/null +++ b/app/Libraries/YamapLibrary.php @@ -0,0 +1,149 @@ +_host = $host; + } + + public function getMyWeb() + { + if ($this->_web === null) { + $this->_web = new MyWebLibrary($this->getHost()); + $this->_web->setDebug($this->getDebug()); + } + return $this->_web; + } + public function getMyStorage() + { + if ($this->_storage === null) { + $this->_storage = new MyStorageLibrary(); + $this->_storage->setDebug($this->getDebug()); + } + return $this->_storage; + } + public function getMyCrawler() + { + if ($this->_crawler === null) { + $this->_crawler = new MyCrawlerLibrary(); + $this->_crawler->setDebug($this->getDebug()); + } + return $this->_crawler; + } + + final public function getDebug(): bool + { + return $this->_debug; + } + final public function setDebug(bool $debug): void + { + $this->_debug = $debug; + } + final public function getHost(): string + { + return $this->_host; + } + + public function getLinks(Crawler $crawler): array + { + //div.bbs_item를 가진 객체를 찾아서 같은 형식의 객체(sibling)를 배열로 넘김 + $bbs_items = $crawler->filter("div.bbs_item")->first()->siblings(); + $links = []; + foreach ($bbs_items as $bbs_item) { + //bbs_item에서 span.g_nickname 객체를 찾아서 "관리자"가 작성한것이 아닌것을 확인 후 + $this->getMyCrawler()->createCrawler($bbs_item)->filter("span.g_nickname")->each(function (Crawler $node) use (&$links, &$bbs_item) { + if ($node->text() != "관리자") { + //다시 bbs_item에서 a.list_subject 객체를 찾아서 Links에 추가한다. + foreach ($this->getMyCrawler()->getLinks($this->getMyCrawler()->createCrawler($bbs_item), ["tag" => ".list_subject", "attr" => "href"]) as $link) { + array_push($links, $link); + } + } + }); + } + return $links; + } + + public function getImages(Crawler $crawler, array $options = ["tag" => "img", "attr" => "src"]): array + { + $images = $this->getMyCrawler()->getImages($crawler, $options); + foreach ($images as $image) { + $this->download($image['src']); + } + return $images; + } + + public function getVideos(Crawler $crawler, array $options = ["tag" => "video", "attr" => "src"]): array + { + $videos = $this->getMyCrawler()->getVideos($crawler, $options); + foreach ($videos as $video) { + $this->download($video['src']); + } + return $videos; + } + + // 파일 다운로드 메서드 + final public function download($url) + { + try { + $fileNames = explode('/', $url); + if (!is_array($fileNames) || !count($fileNames)) { + throw new \Exception("Download URL Error:" . $url); + } + if (!is_dir($this->getMyStorage()->getPath())) { + if (!mkdir($this->getMyStorage()->getPath())) { + throw new \Exception("Make Directory Error:" . $this->getMyStorage()->getPath()); + } + } + $fullPath = $this->getMyStorage()->getPath() . DIRECTORY_SEPARATOR . array_pop($fileNames); + log_message("debug", "FullPath-> " . $fullPath); + if (!$this->getDebug()) { + $response = $this->getMyWeb()->getClient()->get($this->getMyWeb()->gethost() . $url, [ + 'cookies' => $this->getMyWeb()->getCookieJar(), + 'sink' => $fullPath, + ]); + if ($response->getStatusCode() == 200) { + log_message("info", "파일이 성공적으로 다운로드되었습니다!"); + return true; + } else { + log_message("info", "파일 다운로드 실패: " . $response->getStatusCode()); + return false; + } + } + return true; + } catch (\Exception $e) { + log_message("error", "파일 다운로드 중 오류 발생: " . $e->getMessage()); + return false; + } + } + + public function getLinksByMainPage(string $url): array + { + $html = $this->getMyWeb()->getContent($url);; + $crawler = $this->getMyCrawler()->createCrawler($html)->filter("div.bbs_list")->first();; + $links = $this->getLinks($crawler,); + if (!count($links)) { + throw new \Exception("Target Links가 없습니다."); + } + return $links; + } + public function getCrawlerByDetailPage(string $url): Crawler + { + log_message("debug", "Target-> " . $url); + $html = $this->getMyWeb()->getContent($url);; + return $this->getMyCrawler()->createCrawler($html)->filter("div.contents")->first(); + } +}