This commit is contained in:
최준흠 2025-03-04 18:44:09 +09:00
commit 088ef7e3d4
14 changed files with 2365 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
.env
node_modules/
build/
coverage/

45
README.md Normal file
View File

@ -0,0 +1,45 @@
# WebScrapper
설정파일
.env 파일
//사용 Language : TypeScript
npm init -y
npm install typescript tsx @types/node --save-dev
npx tsc --init
//참고자료: [https://mycodings.fly.dev/blog/2023-08-07-sveltekit-with-prisma-and-deploy-to-fly-io](https://mycodings.fly.dev/blog/2023-08-07-sveltekit-with-prisma-and-deploy-to-fly-io)
//사용 ORM : Prisma
설치법 : npm install prisma --save-dev
//Prisma Client
설치법 : npm install @prisma/client
//사용 DB : Sqlite
설치법 : npx prisma init --datasource-provider sqlite
//Prisma DB 생성
a. npx prisma migrate dev --name init (prisma 초기화)
b. prisma/schema.prisma에 Model(Table) 구조부터 생성
c. npx prisma format (Model을 검증하는 듯하다.)
npx prisma db push (db 초기화/생성 : sqlite)
npx prisma generate (Prisma Client 클라이언트를 생성)
//prisma seed관련 (DB에 자동으로 초기값 넣는 seed 파일 만들기)
a. lib 폴더에 database.ts 파일을 만듭시다.
b. package.json에 prisma관련 seed 프로그램 추가
"prisma": {
"seed": "node prisma/seed.js"
}
c. prisma/seed.js를 만든다.
d. 암호를 해시하는 툴인 bcrypt 설치
npm i bcrypt
npm i -D @types/bcrypt prisma
e. 실제 DB 적용하기
npx prisma migrate dev
// 실행후 메세지
// Environment variables loaded from .env
// Prisma schema loaded from prisma\schema.prisma
// Datasource "db": SQLite database "dev.db" at "file:./dev.db"
//실행방법
npx tsx app.ts jjang0u [true|false] (ts-node를 설치한 경우: ts-node app.ts jjang0u [true|false])

18
app.ts Normal file
View File

@ -0,0 +1,18 @@
import { Jjang0u } from "./scrapper/jjang0u";
/**
 * Command-line entry point.
 *
 * Reads the site name from `process.argv[2]` and the debug flag from
 * `process.argv[3]` (debug is enabled only by the literal string "true"),
 * runs the matching scraper and prints the scraped rows.
 *
 * Errors are logged rather than rethrown; on failure the process exit code is
 * set to 1 so that shells and schedulers can tell a failed run from a
 * successful one.
 */
async function main() {
  try {
    const siteName = process.argv[2];
    // Only one site is supported so far; reject everything else up front.
    if (siteName !== "jjang0u") {
      throw new Error("Unknown site name: " + siteName);
    }
    const scraper = new Jjang0u(process.argv[3] === "true");
    const lists = await scraper.run();
    console.log("Data List:", lists);
  } catch (error) {
    console.error("Scraping failed:", error);
    // Use exitCode instead of process.exit() so pending output is flushed.
    process.exitCode = 1;
  }
}
//실행
main();

3
lib/database.ts Normal file
View File

@ -0,0 +1,3 @@
import prisma from "@prisma/client";
/**
 * Prisma client instance created once when this module is loaded and exported
 * for other modules to import.
 */
export const db = new prisma.PrismaClient();

1875
package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

36
package.json Normal file
View File

@ -0,0 +1,36 @@
{
"name": "webscraping",
"version": "1.0.0",
"main": "index.js",
"type": "module",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"repository": {
"type": "git",
"url": "http://gitlab.idcjp.jp:3000/idcjp/WebScraping.git"
},
"keywords": [],
"author": "",
"license": "ISC",
"description": "",
"devDependencies": {
"@types/bcrypt": "^5.0.2",
"@types/node": "^22.13.8",
"prisma": "^6.4.1"
},
"dependencies": {
"@prisma/client": "^6.4.1",
"@types/cheerio": "^0.22.35",
"@types/dotenv": "^6.1.1",
"@types/uuid": "^10.0.0",
"axios": "^1.8.1",
"bcrypt": "^5.1.1",
"cheerio": "^1.0.0",
"dotenv": "^16.4.7",
"uuid": "^11.1.0"
},
"prisma": {
"seed": "node prisma/seed.js"
}
}

BIN
prisma/dev.db Normal file

Binary file not shown.

View File

@ -0,0 +1,34 @@
-- CreateTable
CREATE TABLE "User" (
"uid" TEXT NOT NULL PRIMARY KEY,
"email" TEXT NOT NULL,
"name" TEXT,
"nickname" TEXT NOT NULL,
"passwordHash" TEXT NOT NULL,
"userAuthToken" TEXT NOT NULL,
"role" TEXT NOT NULL DEFAULT 'USER',
"status" BOOLEAN NOT NULL DEFAULT false,
"at_created" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
"at_updated" DATETIME NOT NULL
);
-- CreateTable
CREATE TABLE "Board" (
"uid" TEXT NOT NULL PRIMARY KEY,
"user_uid" TEXT NOT NULL,
"category" TEXT NOT NULL DEFAULT 'free',
"title" TEXT NOT NULL,
"content" TEXT NOT NULL,
"view" INTEGER NOT NULL DEFAULT 0,
"recommend" INTEGER NOT NULL DEFAULT 0,
"status" BOOLEAN NOT NULL DEFAULT false,
"at_created" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
"at_updated" DATETIME NOT NULL,
CONSTRAINT "Board_user_uid_fkey" FOREIGN KEY ("user_uid") REFERENCES "User" ("uid") ON DELETE RESTRICT ON UPDATE CASCADE
);
-- CreateIndex
CREATE UNIQUE INDEX "User_email_key" ON "User"("email");
-- CreateIndex
CREATE UNIQUE INDEX "User_userAuthToken_key" ON "User"("userAuthToken");

View File

@ -0,0 +1,3 @@
# Please do not edit this file manually
# It should be added in your version-control system (e.g., Git)
provider = "sqlite"

39
prisma/schema.prisma Normal file
View File

@ -0,0 +1,39 @@
// This is your Prisma schema file,
// learn more about it in the docs: https://pris.ly/d/prisma-schema
generator client {
provider = "prisma-client-js"
}
datasource db {
provider = "sqlite"
url = env("DATABASE_URL")
}
// Account record. Each user can own any number of Board rows via `boards`.
model User {
// String primary key, generated as a UUID by default.
uid String @id @default(uuid()) //@default(autoincrement())
email String @unique
name String?
nickname String
passwordHash String
userAuthToken String @unique
// Stored as a plain string; defaults to "USER".
role String @default("USER")
status Boolean @default(false)
at_created DateTime @default(now())
at_updated DateTime @updatedAt
boards Board[] // Relation to the Board model: rows whose user_uid references this user
}
// Board post. Every row belongs to exactly one User through `user_uid`.
model Board {
uid String @id @default(uuid())
// Foreign key column backing the `author` relation below.
user_uid String
category String @default("free")
title String
content String
// Counters start at zero.
view Int @default(0)
recommend Int @default(0)
status Boolean @default(false)
at_created DateTime @default(now())
at_updated DateTime @updatedAt
// Owning user: user_uid references User.uid.
author User @relation(fields: [user_uid], references: [uid])
}

30
prisma/seed.js Normal file
View File

@ -0,0 +1,30 @@
import { PrismaClient } from "@prisma/client";
import bcrypt from "bcrypt";
import { v4 as uuidv4 } from "uuid";
const prisma = new PrismaClient();
/**
 * Seeds the database with the initial administrator account.
 *
 * The account is written with `upsert` keyed on the unique `email` column, so
 * running the seed again leaves an existing administrator untouched instead of
 * failing on the unique constraint.
 *
 * On failure the error is logged and the process exit code is set to 1.
 * `process.exit()` is deliberately not called: it would end the process
 * before the `finally` block could disconnect the Prisma client.
 *
 * NOTE(review): the password below is a hard-coded development credential;
 * change it before seeding any shared or production database.
 *
 * @returns {Promise<void>}
 */
async function seed() {
  const email = "admin@example.com";
  try {
    const adminUser = await prisma.user.upsert({
      where: { email },
      // Nothing to change when the administrator already exists.
      update: {},
      create: {
        email,
        nickname: "관리자",
        name: "test",
        passwordHash: await bcrypt.hash("password123", 10),
        userAuthToken: uuidv4(),
        role: "ADMIN",
        status: true,
      },
    });
    console.log("시드 데이터 생성됨:", adminUser);
  } catch (error) {
    console.error("시드 데이터 생성 중 오류 발생:", error);
    process.exitCode = 1;
  } finally {
    await prisma.$disconnect();
  }
}
seed();

44
scrapper/jjang0u.ts Normal file
View File

@ -0,0 +1,44 @@
import { MyScrapper,BoardStruct } from "./myScrapper";
import * as cheerio from 'cheerio';
import dotenv from "dotenv";
/**
 * Scraper for the "jjang0u" board.
 *
 * All site-specific values (host, list URL and CSS selectors) are read from
 * `jjang0u_*` environment variables, loaded from `.env` through dotenv.
 * A missing variable falls back to an empty string.
 */
export class Jjang0u extends MyScrapper {
  /**
   * @param debug When true, the base class prints diagnostic output.
   */
  constructor(debug: boolean = false) {
    // Load .env first so the process.env lookups below can see its values.
    dotenv.config();
    const config = {
      host: process.env.jjang0u_HOST ?? "",
      url: process.env.jjang0u_URL ?? "",
      rows: process.env.jjang0u_ROWS ?? "",
      title: process.env.jjang0u_TITLE ?? "",
      nickname: process.env.jjang0u_NICKNAME ?? "",
      content: process.env.jjang0u_CONTENT ?? ""
    };
    super(config, debug);
  }

  /**
   * Copies the value of one table cell into the row being built.
   *
   * @param row     Row accumulated so far; it is updated in place and returned.
   * @param element The `td` element of the current cell.
   * @param cnt     Zero-based position of the cell within its table row.
   * @returns The same `row` object.
   */
  protected extractRow(row: BoardStruct, element: cheerio.Element, cnt: number): BoardStruct {
    // NOTE(review): cheerio.load() receives the cell node rather than markup,
    // presumably to scope selector lookups to this cell - confirm against the
    // cheerio version in use.
    const $ = cheerio.load(element);
    switch (cnt) {
      case 0: // row number: not needed
        break;
      case 1: { // title
        const title = $(this.config.title);
        row.title = title.text();
        // The title link also carries the URL of the detail page.
        row.detail_url = title.attr('href') ?? '';
        break;
      }
      case 2: // nickname
        row.nickname = $(this.config.nickname).text();
        break;
      case 3: // at_created (2025.03.03 -> 2025-03-03)
        row.at_created = $(element).text().replace(/\./g, "-");
        break;
      case 4: { // view (1,000 -> 1000)
        const view = Number.parseInt($(element).text().replace(/,/g, ""), 10);
        // Keep the previous value when the cell does not hold a number,
        // instead of storing NaN.
        if (!Number.isNaN(view)) {
          row.view = view;
        }
        break;
      }
      case 5: // recommend: not needed
        break;
    }
    return row;
  }
}

121
scrapper/myScrapper.ts Normal file
View File

@ -0,0 +1,121 @@
import axios from "axios";
import * as cheerio from "cheerio";
/**
 * One scraped board post: the values taken from a list-page row plus the
 * content fetched from the post's detail page.
 */
export interface BoardStruct {
// Optional identifier; the scraper code in this file never assigns it.
uid?: number;
title: string;
nickname: string;
// HTML of the detail page's content element; empty string when unavailable.
content: string;
view: number;
recommend?: number;
// Creation date kept as text exactly as the scraper subclass produced it.
at_created: string;
// Link to the detail page, passed to getHTML when the content is fetched.
detail_url: string;
}
/**
 * Per-site settings consumed by MyScrapper and its subclasses.
 */
export interface Config {
// Prefix that getHTML puts in front of the paths it requests.
host: string;
// Path of the list page that run() fetches first.
url: string;
// Selector handed to getListRows to pick the list rows.
rows: string;
// Selectors for subclasses to use inside extractRow.
title: string;
nickname: string;
// Selector applied to a detail page to extract the post content.
content: string;
}
/**
 * Base class for board scrapers.
 *
 * `run()` downloads the list page, turns every element matched by
 * `config.rows` into a `BoardStruct` (delegating the per-cell work to the
 * subclass through `extractRow`), and then downloads each post's detail page
 * to fill in `content`.
 */
export abstract class MyScrapper {
  /** When true, diagnostic output is printed to the console. */
  protected debug: boolean;
  /** Site-specific host, paths and selectors. */
  protected config: Config;

  /**
   * @param config Site-specific settings.
   * @param debug  Enables diagnostic logging.
   */
  constructor(config:Config,debug: boolean = false) {
    this.config = config;
    this.debug = debug;
  }

  /**
   * Copies the value of one table cell into the row being built.
   *
   * @param row     Row accumulated so far.
   * @param element The `td` element of the current cell.
   * @param cnt     Zero-based position of the cell within its table row.
   * @returns The row to carry on to the next cell.
   */
  protected abstract extractRow(row: BoardStruct, element: cheerio.Element, cnt: number): BoardStruct;

  /**
   * Downloads a page.
   *
   * @param url Path appended to `config.host`, or an absolute http(s) URL
   *            which is requested as-is.
   * @returns The response body, or `null` when the request fails (the error
   *          is logged, not thrown).
   */
  private async getHTML(url: string): Promise<string | null> {
    // Links scraped from a page may already be absolute; prefixing those with
    // the host would produce an invalid address.
    const target = /^https?:\/\//i.test(url) ? url : this.config.host + url;
    try {
      const { data } = await axios.get(target, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
      });
      if (this.debug) {
        console.log(`getHTML:${url}=>길이:${data.length}`);
      }
      return data;
    } catch (error) {
      console.error(`Failed to fetch ${url}:`, error);
      return null;
    }
  }

  /**
   * Downloads a detail page and returns the inner HTML of the first element
   * matching `selector`.
   *
   * @returns The HTML, or an empty string when there is no URL, the download
   *          fails, or nothing matches.
   */
  private async extractDetail(url: string,selector: string): Promise<string> {
    // A row without a link has no detail page; requesting "" would fetch the
    // bare host page and store unrelated markup as the post content.
    if (!url) return '';
    const html = await this.getHTML(url);
    if (!html) return '';
    const $ = cheerio.load(html);
    return $(selector).html() ?? '';
  }

  /**
   * Builds one `BoardStruct` from a list row by passing each of its `td`
   * cells, together with the cell's position, to `extractRow`.
   */
  private getListRow(element: cheerio.Cheerio): BoardStruct {
    let row: BoardStruct = {
      title: '',
      nickname: '',
      content: '',
      at_created: '',
      view: 0,
      recommend: 0,
      detail_url: ''
    };
    // Walk the td cells in document order; the index identifies the column.
    element.find('td').toArray().forEach((tdElement, cnt) => {
      row = this.extractRow(row, tdElement, cnt);
    });
    if (this.debug) {
      console.log("Row:", row);
    }
    return row;
  }

  /**
   * Parses the list page and converts every element matched by `selector`
   * into a `BoardStruct`.
   */
  private async getListRows(html:string, selector:string): Promise<BoardStruct[]> {
    const $ = cheerio.load(html);
    // Diagnostics that help to adjust the selectors to the page structure.
    if (this.debug) {
      console.log("전체 HTML:", $.html().substring(0, 500)); // first 500 characters only
      console.log("검색된 요소 수:", $(selector).length);
      // Print the id of every div that has one.
      $('div[id]').each((_, el) => {
        console.log("발견된 div id:", $(el).attr('id'));
      });
    }
    const lists: BoardStruct[] = [];
    $(selector).each((_, element) => {
      lists.push(this.getListRow($(element)));
    });
    return lists;
  }

  /**
   * Scrapes the list page and then the detail page of every post.
   *
   * Detail pages are requested one at a time, in list order.
   *
   * @returns The scraped posts; an empty array when the list page cannot be
   *          downloaded.
   */
  public async run(): Promise<BoardStruct[]> {
    const html = await this.getHTML(this.config.url);
    if (!html) return [];
    const rows = await this.getListRows(html, this.config.rows);
    const lists: BoardStruct[] = [];
    for (const row of rows) {
      row.content = await this.extractDetail(row.detail_url,this.config.content);
      lists.push(row);
    }
    return lists;
  }
}

113
tsconfig.json Normal file
View File

@ -0,0 +1,113 @@
{
"compilerOptions": {
/* Visit https://aka.ms/tsconfig to read more about this file */
/* Projects */
// "incremental": true, /* Save .tsbuildinfo files to allow for incremental compilation of projects. */
// "composite": true, /* Enable constraints that allow a TypeScript project to be used with project references. */
// "tsBuildInfoFile": "./.tsbuildinfo", /* Specify the path to .tsbuildinfo incremental compilation file. */
// "disableSourceOfProjectReferenceRedirect": true, /* Disable preferring source files instead of declaration files when referencing composite projects. */
// "disableSolutionSearching": true, /* Opt a project out of multi-project reference checking when editing. */
// "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
/* Language and Environment */
"target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
// "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
// "jsx": "preserve", /* Specify what JSX code is generated. */
// "libReplacement": true, /* Enable lib replacement. */
// "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
// "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
// "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
// "jsxFragmentFactory": "", /* Specify the JSX Fragment reference used for fragments when targeting React JSX emit e.g. 'React.Fragment' or 'Fragment'. */
// "jsxImportSource": "", /* Specify module specifier used to import the JSX factory functions when using 'jsx: react-jsx*'. */
// "reactNamespace": "", /* Specify the object invoked for 'createElement'. This only applies when targeting 'react' JSX emit. */
// "noLib": true, /* Disable including any library files, including the default lib.d.ts. */
// "useDefineForClassFields": true, /* Emit ECMAScript-standard-compliant class fields. */
// "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
/* Modules */
"module": "commonjs", /* Specify what module code is generated. */
// "rootDir": "./", /* Specify the root folder within your source files. */
// "moduleResolution": "node10", /* Specify how TypeScript looks up a file from a given module specifier. */
// "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
// "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
// "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */
// "typeRoots": [], /* Specify multiple folders that act like './node_modules/@types'. */
// "types": [], /* Specify type package names to be included without being referenced in a source file. */
// "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
// "moduleSuffixes": [], /* List of file name suffixes to search when resolving a module. */
// "allowImportingTsExtensions": true, /* Allow imports to include TypeScript file extensions. Requires '--moduleResolution bundler' and either '--noEmit' or '--emitDeclarationOnly' to be set. */
// "rewriteRelativeImportExtensions": true, /* Rewrite '.ts', '.tsx', '.mts', and '.cts' file extensions in relative import paths to their JavaScript equivalent in output files. */
// "resolvePackageJsonExports": true, /* Use the package.json 'exports' field when resolving package imports. */
// "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
// "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
// "noUncheckedSideEffectImports": true, /* Check side effect imports. */
// "resolveJsonModule": true, /* Enable importing .json files. */
// "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
// "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
/* JavaScript Support */
// "allowJs": true, /* Allow JavaScript files to be a part of your program. Use the 'checkJS' option to get errors from these files. */
// "checkJs": true, /* Enable error reporting in type-checked JavaScript files. */
// "maxNodeModuleJsDepth": 1, /* Specify the maximum folder depth used for checking JavaScript files from 'node_modules'. Only applicable with 'allowJs'. */
/* Emit */
// "declaration": true, /* Generate .d.ts files from TypeScript and JavaScript files in your project. */
// "declarationMap": true, /* Create sourcemaps for d.ts files. */
// "emitDeclarationOnly": true, /* Only output d.ts files and not JavaScript files. */
// "sourceMap": true, /* Create source map files for emitted JavaScript files. */
// "inlineSourceMap": true, /* Include sourcemap files inside the emitted JavaScript. */
// "noEmit": true, /* Disable emitting files from a compilation. */
// "outFile": "./", /* Specify a file that bundles all outputs into one JavaScript file. If 'declaration' is true, also designates a file that bundles all .d.ts output. */
// "outDir": "./", /* Specify an output folder for all emitted files. */
// "removeComments": true, /* Disable emitting comments. */
// "importHelpers": true, /* Allow importing helper functions from tslib once per project, instead of including them per-file. */
// "downlevelIteration": true, /* Emit more compliant, but verbose and less performant JavaScript for iteration. */
// "sourceRoot": "", /* Specify the root path for debuggers to find the reference source code. */
// "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
// "inlineSources": true, /* Include source code in the sourcemaps inside the emitted JavaScript. */
// "emitBOM": true, /* Emit a UTF-8 Byte Order Mark (BOM) in the beginning of output files. */
// "newLine": "crlf", /* Set the newline character for emitting files. */
// "stripInternal": true, /* Disable emitting declarations that have '@internal' in their JSDoc comments. */
// "noEmitHelpers": true, /* Disable generating custom helper functions like '__extends' in compiled output. */
// "noEmitOnError": true, /* Disable emitting files if any type checking errors are reported. */
// "preserveConstEnums": true, /* Disable erasing 'const enum' declarations in generated code. */
// "declarationDir": "./", /* Specify the output directory for generated declaration files. */
/* Interop Constraints */
// "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
// "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */
// "isolatedDeclarations": true, /* Require sufficient annotation on exports so other tools can trivially generate declaration files. */
// "erasableSyntaxOnly": true, /* Do not allow runtime constructs that are not part of ECMAScript. */
// "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
"esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
// "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
"forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */
/* Type Checking */
"strict": true, /* Enable all strict type-checking options. */
// "noImplicitAny": true, /* Enable error reporting for expressions and declarations with an implied 'any' type. */
// "strictNullChecks": true, /* When type checking, take into account 'null' and 'undefined'. */
// "strictFunctionTypes": true, /* When assigning functions, check to ensure parameters and the return values are subtype-compatible. */
// "strictBindCallApply": true, /* Check that the arguments for 'bind', 'call', and 'apply' methods match the original function. */
// "strictPropertyInitialization": true, /* Check for class properties that are declared but not set in the constructor. */
// "strictBuiltinIteratorReturn": true, /* Built-in iterators are instantiated with a 'TReturn' type of 'undefined' instead of 'any'. */
// "noImplicitThis": true, /* Enable error reporting when 'this' is given the type 'any'. */
// "useUnknownInCatchVariables": true, /* Default catch clause variables as 'unknown' instead of 'any'. */
// "alwaysStrict": true, /* Ensure 'use strict' is always emitted. */
// "noUnusedLocals": true, /* Enable error reporting when local variables aren't read. */
// "noUnusedParameters": true, /* Raise an error when a function parameter isn't read. */
// "exactOptionalPropertyTypes": true, /* Interpret optional property types as written, rather than adding 'undefined'. */
// "noImplicitReturns": true, /* Enable error reporting for codepaths that do not explicitly return in a function. */
// "noFallthroughCasesInSwitch": true, /* Enable error reporting for fallthrough cases in switch statements. */
// "noUncheckedIndexedAccess": true, /* Add 'undefined' to a type when accessed using an index. */
// "noImplicitOverride": true, /* Ensure overriding members in derived classes are marked with an override modifier. */
// "noPropertyAccessFromIndexSignature": true, /* Enforces using indexed accessors for keys declared using an indexed type. */
// "allowUnusedLabels": true, /* Disable error reporting for unused labels. */
// "allowUnreachableCode": true, /* Disable error reporting for unreachable code. */
/* Completeness */
// "skipDefaultLibCheck": true, /* Skip type checking .d.ts files that are included with TypeScript. */
"skipLibCheck": true /* Skip type checking all .d.ts files. */
}
}