121 lines
3.2 KiB
TypeScript
121 lines
3.2 KiB
TypeScript
import axios from "axios";
|
|
import * as cheerio from "cheerio";
|
|
|
|
/**
 * One board post as collected by a scraper.
 *
 * A row is first filled from the list page (one `extractRow` call per `<td>`),
 * after which `run()` stores the detail page's HTML in `content`.
 */
export interface BoardStruct {
  /** Optional numeric identifier of the post (presumably the board's own post id — confirm with the subclass). */
  uid?: number;
  /** Post title. */
  title: string;
  /** Author nickname. */
  nickname: string;
  /** Post body; `run()` fills this with the inner HTML selected on the detail page. */
  content: string;
  /** View counter. */
  view: number;
  /** Optional recommendation counter. */
  recommend?: number;
  /** Creation timestamp kept as a string (format is whatever the subclass extracts). */
  at_created: string;
  /** URL of the post's detail page; passed to the detail fetch by `run()`. */
  detail_url: string;
}
|
|
|
|
/**
 * Settings that tell a scraper where the board lives and how to pick
 * elements out of its pages.
 */
export interface Config {
  /** Prefix prepended to every requested URL (e.g. scheme + host). */
  host: string;
  /** Location of the list page, appended to `host` when fetched. */
  url: string;
  /** Selector matching one element per post row on the list page. */
  rows: string;
  /** Not read by the base class; presumably a title selector for subclasses — confirm. */
  title: string;
  /** Not read by the base class; presumably a nickname selector for subclasses — confirm. */
  nickname: string;
  /** Selector whose inner HTML on the detail page becomes the post content. */
  content: string;
}
|
|
|
|
/**
 * Base class for scraping a table-style board.
 *
 * `run()` downloads the list page, turns every element matched by
 * `config.rows` into a `BoardStruct` (delegating the per-cell mapping to the
 * subclass through `extractRow`), then downloads each post's detail page to
 * fill in `content`.
 */
export abstract class MyScrapper {
  /** Browser-like User-Agent header sent with every request. */
  private static readonly USER_AGENT =
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';

  /** Matches URLs that already carry an http(s) scheme. */
  private static readonly ABSOLUTE_URL = /^https?:\/\//i;

  /** Site location and selectors; see `Config`. */
  protected config: Config;

  /** When true, progress and HTML-structure diagnostics are written to the console. */
  protected debug: boolean;

  /**
   * @param config Site location and selectors.
   * @param debug  Enables console diagnostics (off by default).
   */
  constructor(config: Config, debug: boolean = false) {
    this.config = config;
    this.debug = debug;
  }

  /**
   * Abstract method: map one `<td>` of a list row onto the row being built.
   *
   * @param row     Row accumulated so far.
   * @param element The current `<td>` element.
   * @param cnt     Zero-based position of the `<td>` within its row.
   * @returns The row to carry forward to the next cell.
   */
  protected abstract extractRow(row: BoardStruct, element: cheerio.Element, cnt: number): BoardStruct;

  /**
   * Build the address to request. URLs that already start with `http://` or
   * `https://` are used unchanged; anything else is appended to `config.host`.
   */
  private resolveUrl(url: string): string {
    return MyScrapper.ABSOLUTE_URL.test(url) ? url : this.config.host + url;
  }

  /**
   * Download a page as text.
   *
   * @param url Absolute URL, or a path appended to `config.host`.
   * @returns The response body, or `null` when the request fails or the body
   *          is not text. Failures are logged, never thrown.
   */
  private async getHTML(url: string): Promise<string | null> {
    try {
      const { data } = await axios.get<unknown>(this.resolveUrl(url), {
        headers: {
          'User-Agent': MyScrapper.USER_AGENT
        }
      });
      // axios may hand back a decoded object (e.g. for JSON responses); only a
      // string can be parsed as HTML, so honour the declared return type.
      if (typeof data !== 'string') {
        console.error(
          `Failed to fetch ${url}:`,
          new TypeError(`expected a text response but received ${typeof data}`)
        );
        return null;
      }
      if (this.debug) {
        console.log("getHTML:" + url + "=>길이:" + data.length);
      }
      return data;
    } catch (error) {
      console.error(`Failed to fetch ${url}:`, error);
      return null;
    }
  }

  /**
   * Fetch a detail page and return the inner HTML of the first element
   * matching `selector`.
   *
   * @returns The inner HTML, or `''` when the page cannot be fetched or
   *          nothing matches.
   */
  private async extractDetail(url: string, selector: string): Promise<string> {
    const html = await this.getHTML(url);
    if (!html) return '';
    const $ = cheerio.load(html);
    return $(selector).html() ?? '';
  }

  /**
   * Convert one list-row element into a `BoardStruct` by passing each of its
   * `<td>` descendants, in document order, through `extractRow`.
   */
  private getListRow(element: cheerio.Cheerio): BoardStruct {
    const blank: BoardStruct = {
      title: '',
      nickname: '',
      content: '',
      at_created: '',
      view: 0,
      recommend: 0,
      detail_url: ''
    };
    // Find the td tags and extract the data from each one; the array index
    // doubles as the cell position handed to the subclass.
    const row = element
      .find('td')
      .toArray()
      .reduce((acc, td, index) => this.extractRow(acc, td, index), blank);
    if (this.debug) {
      console.log("Row:", row);
    }
    return row;
  }

  /**
   * Parse the list page and build one `BoardStruct` per element matched by
   * `selector`.
   */
  private async getListRows(html: string, selector: string): Promise<BoardStruct[]> {
    const $ = cheerio.load(html);
    // Diagnostics for inspecting the HTML structure.
    if (this.debug) {
      console.log("전체 HTML:", $.html().substring(0, 500)); // first 500 characters only
      console.log("검색된 요소 수:", $(selector).length);

      // Print the id of every div on the page that has one.
      $('div[id]').each((_, el) => {
        console.log("발견된 div id:", $(el).attr('id'));
      });
    }
    // Locate the row elements with the selector.
    return $(selector)
      .toArray()
      .map((element) => this.getListRow($(element)));
  }

  /**
   * Scrape the board: list page first, then every post's detail page.
   *
   * Detail pages are requested one at a time, in list order. Rows without a
   * `detail_url` (for example header rows that contain no `<td>`) are returned
   * as extracted, without a detail request.
   *
   * @returns The collected posts, or `[]` when the list page cannot be fetched.
   */
  public async run(): Promise<BoardStruct[]> {
    const html = await this.getHTML(this.config.url);
    if (!html) return [];
    const rows = await this.getListRows(html, this.config.rows);
    for (const row of rows) {
      // An empty detail_url would request the bare host and overwrite the
      // row's content with unrelated markup, so leave such rows untouched.
      if (!row.detail_url) continue;
      row.content = await this.extractDetail(row.detail_url, this.config.content);
    }
    return rows;
  }
}