WebScraping/scrapper/myScrapper.ts
2025-03-04 18:44:09 +09:00

121 lines
3.2 KiB
TypeScript

import axios from "axios";
import * as cheerio from "cheerio";
/**
 * One scraped board entry.
 *
 * The scraper creates an empty record for every list row, lets the concrete
 * subclass fill it cell by cell, and afterwards stores the detail page HTML
 * in `content`.
 */
export interface BoardStruct {
  /** Optional numeric identifier; never assigned by the base scraper. */
  uid?: number;

  /** Post title. */
  title: string;

  /** Author nickname. */
  nickname: string;

  /** HTML taken from the detail page using the configured content selector. */
  content: string;

  /** View counter (presumably the number of views shown in the list — confirm against the target site). */
  view: number;

  /** Optional recommendation counter (presumably up-votes — confirm against the target site). */
  recommend?: number;

  /** Creation timestamp kept as the raw string; no format is enforced here. */
  at_created: string;

  /** Address of the detail page; the scraper fetches it to obtain `content`. */
  detail_url: string;
}
/**
 * Settings that describe the site to scrape and the selectors to apply.
 */
export interface Config {
  /** Prefix prepended to every requested path (e.g. scheme + domain). */
  host: string;

  /** Path of the list page, requested relative to `host`. */
  url: string;

  /** Selector that matches each list row on the list page. */
  rows: string;

  /** Title selector — not read by the base class; presumably for subclasses (confirm). */
  title: string;

  /** Nickname selector — not read by the base class; presumably for subclasses (confirm). */
  nickname: string;

  /** Selector whose inner HTML on the detail page becomes the entry content. */
  content: string;
}
/**
 * Base class for list-style board scrapers.
 *
 * `run()` downloads the list page, turns every element matched by
 * `config.rows` into a {@link BoardStruct} (delegating per-cell parsing to the
 * subclass via `extractRow`), then downloads each entry's detail page and
 * stores the HTML matched by `config.content` in `content`.
 */
export abstract class MyScrapper {
  /** When true, diagnostic information is written to the console. */
  protected debug: boolean;

  /** Site address and selectors used by this scraper. */
  protected config: Config;

  /**
   * @param config Site address and selectors.
   * @param debug  Enables console diagnostics when true.
   */
  constructor(config: Config, debug: boolean = false) {
    this.config = config;
    this.debug = debug;
  }

  /**
   * Abstract hook: copies the data of one `td` cell into the row record.
   *
   * @param row     Record accumulated so far for the current list row.
   * @param element The `td` element being visited.
   * @param cnt     Zero-based index of the cell within its row.
   * @returns The record to carry on to the next cell.
   */
  protected abstract extractRow(row: BoardStruct, element: cheerio.Element, cnt: number): BoardStruct;

  /**
   * Builds the address to request: absolute http(s) URLs are used unchanged,
   * anything else is appended to `config.host`.
   */
  private resolveUrl(url: string): string {
    return /^https?:\/\//i.test(url) ? url : this.config.host + url;
  }

  /**
   * Downloads a page.
   *
   * @param url Path relative to `config.host`, or an absolute http(s) URL.
   * @returns The response body, or `null` when the request fails (the error is logged).
   */
  private async getHTML(url: string): Promise<string | null> {
    try {
      const { data } = await axios.get(this.resolveUrl(url), {
        headers: {
          // Browser-like User-Agent sent with every request.
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
      });
      if (this.debug) {
        console.log("getHTML:"+ url + "=>길이:" + data.length);
      }
      return data;
    } catch (error) {
      console.error(`Failed to fetch ${url}:`, error);
      return null;
    }
  }

  /**
   * Downloads a detail page and returns the inner HTML of the first element
   * matching `selector`, or an empty string when the page or element is missing.
   */
  private async extractDetail(url: string, selector: string): Promise<string> {
    const html = await this.getHTML(url);
    if (!html) return '';
    const $ = cheerio.load(html);
    return $(selector).html() ?? '';
  }

  /**
   * Converts one list row into a record by feeding each of its `td` cells,
   * in document order, to `extractRow`.
   */
  private getListRow(element: cheerio.Cheerio): BoardStruct {
    let row: BoardStruct = {
      title: '',
      nickname: '',
      content: '',
      at_created: '',
      view: 0,
      recommend: 0,
      detail_url: ''
    };
    // Visit every td cell; the cell index tells the subclass which column it is.
    element.find('td').toArray().forEach((cell, index) => {
      row = this.extractRow(row, cell, index);
    });
    if (this.debug) {
      console.log("Row:", row);
    }
    return row;
  }

  /**
   * Parses the list page and returns one record per element matching `selector`.
   */
  private getListRows(html: string, selector: string): BoardStruct[] {
    const $ = cheerio.load(html);
    // Debug aid for inspecting the page structure.
    if (this.debug) {
      console.log("전체 HTML:", $.html().substring(0, 500)); // first 500 characters only
      console.log("검색된 요소 수:", $(selector).length);
      // Print the id of every div that has one.
      $('div[id]').each((_, el) => {
        console.log("발견된 div id:", $(el).attr('id'));
      });
    }
    return $(selector)
      .toArray()
      .map(element => this.getListRow($(element)));
  }

  /**
   * Scrapes the configured list page together with the detail page of each entry.
   *
   * Detail pages are requested one after another. Rows without a `detail_url`
   * (for example header rows) are returned as parsed, without a detail request.
   *
   * @returns The scraped entries, or an empty array when the list page cannot be fetched.
   */
  public async run(): Promise<BoardStruct[]> {
    const html = await this.getHTML(this.config.url);
    if (!html) return [];
    const rows = this.getListRows(html, this.config.rows);
    const lists: BoardStruct[] = [];
    for (const row of rows) {
      // An empty detail_url would resolve to the site root, so skip the request.
      if (row.detail_url) {
        row.content = await this.extractDetail(row.detail_url, this.config.content);
      }
      lists.push(row);
    }
    return lists;
  }
}