diff --git a/app/Controllers/CLI/Crawler.php b/app/Controllers/CLI/Crawler.php index 33f3423..b4ae844 100644 --- a/app/Controllers/CLI/Crawler.php +++ b/app/Controllers/CLI/Crawler.php @@ -2,24 +2,32 @@ namespace App\Controllers\CLI; -use App\Libraries\CrawlerLibrary; use App\Controllers\BaseController; +use App\Libraries\MyCrawler\YamapLibrary; class Crawler extends BaseController { public function yamap() { try { - $library = new CrawlerLibrary("https://www.yamap16.com"); - echo "Host-> " . $library->getHost() . "\n"; - // $html = $library->getInnerHTML("/Board/List.aspx?id=free&ca=1"); - // $links = $library->getLinks($html, "a.list_subject"); - $url = "/Board/View.aspx?id=free&ca=1&rno=192681&page=1"; - $html = $library->getInnerHTML($url, "div.contents p"); - $images = $library->getImages($html); + $library = new YamapLibrary("https://www.yamap16.com"); + $mainPage = $library->getContent("/Board/List.aspx?id=free"); + $links = $library->getLinks($mainPage); + + //Image형식이나 , Viedeo형식의 Content를 가지고 있으면 + log_message("debug", "viewLink-> " . $links[0]["href"]); + $viewPage = $library->getContent($links[0]["href"]); + // $viewPage = $library->getContent("/Board/View.aspx?id=free&ca=&rno=193046&page=1"); //Image + // $viewPage = $library->getContent("/Board/View.aspx?id=free&ca=&rno=193055&page=1"); //Video + $images = $library->getImages($viewPage); foreach ($images as $image) { - echo "Image-> " . $image . "\n"; - $library->download($image); + log_message("debug", "Image-> " . $image['src']); + $library->download($image['src']); + } + $videos = $library->getVideos($viewPage); + foreach ($videos as $video) { + log_message("debug", "Video-> " . $video['src']); + $library->download($video['src']); } log_message("info", "완료되었습니다."); return true; diff --git a/app/Libraries/CrawlerLibrary.php b/app/Libraries/CrawlerLibrary.php deleted file mode 100644 index 543f540..0000000 --- a/app/Libraries/CrawlerLibrary.php +++ /dev/null @@ -1,45 +0,0 @@ -getClient()->request('GET', $this->gethost() . $url); - return $response->getBody()->getContents(); - } - - final public function getInnerHTML(string $url, $tag = false) - { - $crawler = new Crawler($this->getContent($url)); - return $tag ? $crawler->filter($tag)->html() : $crawler->html(); - } - - final public function getLinks(string $html, string $tag = "a"): array - { - $crawler = new Crawler($html); - return $crawler->filter($tag)->each( - function (Crawler $node) { - return $node->attr("href"); - } - ); - } - - final public function getImages(string $html, $tag = "img"): array - { - $crawler = new Crawler($html); - return $crawler->filter($tag)->each( - function (Crawler $node) { - return $node->attr("src"); - } - ); - } -} diff --git a/app/Libraries/WebBaseLibrary.php b/app/Libraries/MyBaseLibrary.php similarity index 72% rename from app/Libraries/WebBaseLibrary.php rename to app/Libraries/MyBaseLibrary.php index b3297c2..6c131e3 100644 --- a/app/Libraries/WebBaseLibrary.php +++ b/app/Libraries/MyBaseLibrary.php @@ -5,7 +5,7 @@ namespace App\Libraries; use GuzzleHttp\Cookie\CookieJar; use GuzzleHttp\Client; -abstract class WebBaseLibrary +abstract class MyBaseLibrary { private $_host = ""; private $_client = null; @@ -36,8 +36,13 @@ abstract class WebBaseLibrary return $this->_cookieJar; } + final public function getContent(string $url): string + { + return $this->getClient()->get($this->gethost() . $url)->getBody(); + } + // 로그인 메서드 - public function login($url, $username, $password) + final public function login($url, $username, $password) { try { $response = $this->getClient()->post($this->gethost() . $url, [ @@ -61,22 +66,23 @@ abstract class WebBaseLibrary } // 파일 다운로드 메서드 - public function download($url, $addPath = false) + final public function download($url, $path = false) { try { - $fullPath = WRITEPATH . "uploads"; - $fullPath .= !$addPath ? '' : DIRECTORY_SEPARATOR . $addPath; - if (!is_dir($fullPath)) { - mkdir($fullPath); + $fileNames = explode('/', $url); + if (!is_array($fileNames) || !count($fileNames)) { + throw new \Exception("Download URL Error:" . $url); } - $temps = explode('/', $url); - if (!is_array($temps) || !count($temps)) { - throw new \Exception("URL error:" . var_dump($temps, true)); + $storagePath = WRITEPATH . "uploads"; + $storagePath .= !$path ? '' : DIRECTORY_SEPARATOR . $path; + if (!is_dir($storagePath)) { + if (!mkdir($storagePath)) { + throw new \Exception("Make Directory Error:" . $storagePath); + } } - $file = $fullPath . DIRECTORY_SEPARATOR . array_pop($temps); $response = $this->getClient()->get($this->gethost() . $url, [ 'cookies' => $this->getCookieJar(), - 'sink' => $file, + 'sink' => $storagePath . DIRECTORY_SEPARATOR . array_pop($fileNames), ]); if ($response->getStatusCode() == 200) { log_message("info", "파일이 성공적으로 다운로드되었습니다!"); diff --git a/app/Libraries/MyCrawler/MyCrawlerLibrary.php b/app/Libraries/MyCrawler/MyCrawlerLibrary.php new file mode 100644 index 0000000..9a23b41 --- /dev/null +++ b/app/Libraries/MyCrawler/MyCrawlerLibrary.php @@ -0,0 +1,42 @@ +getCrawler($html)->filter($tag)->html() : $this->getCrawler($html)->html(); + } + + public function getLinks(string $html, array $options = ["tag" => "a", "attr" => "href"]): array + { + return $this->getCrawler($html)->filter($options["tag"])->each( + function (Crawler $node) use (&$options) { + return ["anchor" => $node->text(), "href" => $node->attr($options["attr"])]; + } + ); + } + + public function getImages(string $html, array $options = ["tag" => "img", "attr" => "src"]): array + { + return $this->getCrawler($html)->filter($options["tag"])->each( + function (Crawler $node) use (&$options) { + return ["alt" => $node->text(), "src" => $node->attr($options["attr"])]; + } + ); + } +} diff --git a/app/Libraries/MyCrawler/YamapLibrary.php b/app/Libraries/MyCrawler/YamapLibrary.php new file mode 100644 index 0000000..a8e0bd3 --- /dev/null +++ b/app/Libraries/MyCrawler/YamapLibrary.php @@ -0,0 +1,55 @@ + "관리자"]): array + { + //div.bbs_item를 가진 객체를 찾아서 배열로 넘김 + $domElements = $this->getCrawler($html)->filter("div.bbs_list div.bbs_item")->first()->siblings(); + $links = []; + foreach ($domElements as $domElement) { + $this->getCrawler($domElement)->filter("span.g_nickname")->each(function (Crawler $node) use (&$options, &$links, &$domElement) { + if ($node->text() != $options["skip"]) { + $links[] = ["anchor" => $node->text(), "href" => $this->getCrawler($domElement)->filter("a.list_subject")->attr("href")]; + } + }); + } + return $links; + } + + public function getImages(string $html, array $options = ["tag" => "img", "attr" => "src"]): array + { + //div.contents 가진 객체를 찾아서 첫번쨰 요소에서만 참조 + $domElement = $this->getCrawler($html)->filter("div.contents")->first(); + return $domElement->filter($options["tag"])->each( + function (Crawler $node) use (&$options) { + return [ + "alt" => $node->attr('alt'), + "src" => $node->attr($options["attr"]) + ]; + } + ); + } + public function getVideos(string $html, array $options = ["tag" => "video", "attr" => "src"]): array + { + //div.contents 가진 객체를 찾아서 첫번쨰 요소에서만 참조 + $domElement = $this->getCrawler($html)->filter("div.contents")->first(); + return $domElement->filter($options["tag"])->each( + function (Crawler $node) use (&$options) { + return [ + "alt" => $node->attr('alt'), + "src" => $node->attr($options["attr"]) + ]; + } + ); + } +}