import axios from "axios";
import * as cheerio from "cheerio";
import { prisma } from "../lib/prisma.ts";
// =========================
// Crawl configuration
// =========================
// First list page to fetch; main() loops from here up to END_PAGE inclusive.
const START_PAGE = 102;
// Last list page to fetch (inclusive).
const END_PAGE = 151;
// Base URL of the list section; fetchList() appends `/page_<n>.aspx` to it.
const BASE_URL =
"https://www.maolian.net/zhinan/cheliangjiaotong/cheliangguohu";
// Fixed business IDs that savePost() attaches to every created post:
// the author, the category, and the single tag it connects.
const AUTHOR_ID = "cmpjqwr1l0000vu3gl5stromv";
const CATEGORY_ID = "cmpl2pxzc0004vuxoysram4xh";
const TAG_ID = "cmpjqwr44000cvu3gts229cdm";
// =========================
// HTTP request wrapper (browser-like headers against anti-bot checks)
// =========================
/**
 * Performs a GET request for `url` via axios.
 *
 * Sends a desktop Chrome `User-Agent` and a Chinese `Accept-Language`
 * header, and gives up after 15 seconds.
 *
 * @param url Absolute URL to fetch.
 * @returns The axios response promise (rejects on network errors, timeouts,
 *          or whatever statuses axios treats as failures).
 */
async function request(url: string) {
  const browserHeaders = {
    "User-Agent":
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "zh-CN,zh;q=0.9",
  };
  const options = { headers: browserHeaders, timeout: 15000 };
  return axios.get(url, options);
}
// =========================
// List page scraping
// =========================

/** One article entry scraped from a list page. */
interface ListItem {
  title: string;
  link: string;
  excerpt: string;
  date: string;
}

/**
 * Turns an href found on a list page into an absolute http(s) URL.
 *
 * The href is resolved against the URL of the page it was found on, so
 * absolute, protocol-relative (`//host/path`), root-relative (`/path`) and
 * document-relative (`path`) links are all handled by the same rule instead
 * of being prefixed with a hard-coded host.
 *
 * @param href    Raw (already trimmed) href attribute value; may be empty.
 * @param pageUrl Absolute URL of the list page the href came from.
 * @returns The absolute URL, or "" when the href is empty, unparsable, or
 *          not an http(s) link (e.g. `javascript:` or `mailto:`).
 */
function resolveArticleLink(href: string, pageUrl: string): string {
  if (!href) return "";
  try {
    const resolved = new URL(href, pageUrl);
    const isWebLink =
      resolved.protocol === "http:" || resolved.protocol === "https:";
    return isWebLink ? resolved.href : "";
  } catch {
    // `new URL` throws on input it cannot parse; treat that as "no link".
    return "";
  }
}

/**
 * Fetches one list page and extracts its article entries.
 *
 * @param page Page number; the page URL is `${BASE_URL}/page_${page}.aspx`.
 * @returns The entries that have both a title and a usable link. Returns an
 *          empty array when none of the known list selectors match.
 *          Rejects if the HTTP request fails.
 */
async function fetchList(page: number): Promise<ListItem[]> {
  const url = `${BASE_URL}/page_${page}.aspx`;
  const res = await request(url);
  const $ = cheerio.load(res.data);

  // Known list layouts, tried in order; the first one that matches is used.
  // (The original author's note describes these as the desktop and mobile
  // variants of the page.)
  const selectors = ["ul.tw-list li", "ul.con-list li"];
  const usedSelector = selectors.find((sel) => $(sel).length > 0) ?? "";
  console.log(`📌 page ${page} 使用 selector: ${usedSelector}`);
  if (!usedSelector) return [];

  const list: ListItem[] = [];
  $(usedSelector).each((_, el) => {
    const $el = $(el);

    // Prefer the heading link; fall back to the first link in the entry.
    const headingLink = $el.find("h3 a");
    const a = headingLink.length > 0 ? headingLink : $el.find("a").first();

    const title = a.text().trim();
    const link = resolveArticleLink(a.attr("href")?.trim() ?? "", url);
    const excerpt =
      $el.find("p").text().trim() || $el.find(".tj-nr").text().trim();
    const date = $el.find(".info span").first().text().trim();

    // Entries without a title or a followable link are dropped.
    if (title && link) {
      list.push({ title, link, excerpt, date });
    }
  });
  return list;
}
// =========================
// Detail page
// =========================

/**
 * Escapes `&`, `<` and `>` so that `text` survives being parsed as an HTML
 * fragment as a single text node with the same characters.
 */
function escapeHtmlText(text: string): string {
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;");
}

/**
 * Fetches an article detail page and returns its title, date and cleaned
 * body HTML.
 *
 * Cleaning steps applied to the `.det-nr` container:
 *  1. every `<a>` is replaced by its plain text,
 *  2. every `<img>` is removed,
 *  3. `style`, `target` and `title` attributes are stripped from all
 *     descendant elements.
 *
 * NOTE(review): other active content (e.g. `<script>`, `on*` attributes) is
 * not stripped here; confirm the stored HTML is sanitized before rendering.
 *
 * @param url Absolute URL of the detail page.
 * @returns `title` / `date` / `content`; each is "" when its selector does
 *          not match. Rejects if the HTTP request fails.
 */
async function fetchDetail(
  url: string
): Promise<{ title: string; date: string; content: string }> {
  const res = await request(url);
  const $ = cheerio.load(res.data);
  const title = $(".det-title h1").text().trim();
  const date = $(".det-title .info span").first().text().trim();
  const $content = $(".det-nr");

  // 1) Unlink anchors, keeping only their text. cheerio parses a string
  //    passed to replaceWith() as HTML, and .text() returns entity-decoded
  //    characters, so the text must be escaped or anchor text such as
  //    "&lt;b&gt;" would be re-injected as real markup.
  $content.find("a").each((_, el) => {
    const $anchor = $(el);
    $anchor.replaceWith(escapeHtmlText($anchor.text()));
  });

  // 2) Drop images entirely.
  $content.find("img").remove();

  // 3) Strip presentational / link attributes from every descendant.
  $content.find("*").each((_, el) => {
    $(el).removeAttr("style").removeAttr("target").removeAttr("title");
  });

  // 4) Serialize the cleaned container's inner HTML.
  const content = $content.html()?.trim() ?? "";
  return { title, date, content };
}
// =========================
// Slug generation
// =========================
/**
 * Builds a slug of the form `post-<epoch millis>-<random suffix>`.
 *
 * The suffix is up to four base-36 characters taken from `Math.random()`,
 * so two slugs created in the same millisecond are still unlikely to clash.
 *
 * @returns The generated slug string.
 */
function createSlug() {
  const timestamp = Date.now();
  const suffix = Math.random().toString(36).substring(2, 6);
  return ["post", timestamp, suffix].join("-");
}
// =========================
// Persistence
// =========================

/** Fields savePost() needs in order to store one scraped article. */
interface PostInput {
  title: string;
  content: string;
  excerpt: string;
  /** Date text scraped from the detail page; accepted but not persisted. */
  date?: string;
}

/**
 * Stores one scraped article as a published post, unless it is unusable or
 * a post with the same title already exists.
 *
 * The post is created with the fixed AUTHOR_ID / CATEGORY_ID and a single
 * tag relation connected to TAG_ID.
 *
 * @param item Scraped article data.
 * @returns Resolves once the post is created or skipped. Rejects if a
 *          Prisma query fails.
 */
async function savePost(item: PostInput): Promise<void> {
  // A failed detail scrape yields empty content; do not publish empty posts.
  if (!item.title || !item.content) {
    console.log("⏭ 内容为空,跳过:", item.title);
    return;
  }

  // De-duplicate by exact title match.
  const exists = await prisma.post.findFirst({
    where: { title: item.title },
  });
  if (exists) {
    console.log("⏭ 已存在:", item.title);
    return;
  }

  const post = await prisma.post.create({
    data: {
      title: item.title,
      // createSlug() takes no arguments; the slug is not derived from the title.
      slug: createSlug(),
      content: item.content,
      excerpt: item.excerpt,
      published: true,
      authorId: AUTHOR_ID,
      categoryId: CATEGORY_ID,
      tags: {
        create: [
          {
            tag: {
              connect: { id: TAG_ID },
            },
          },
        ],
      },
    },
  });
  console.log("✅ 入库:", post.title);
}
// =========================
// Main flow
// =========================
/**
 * Crawls list pages START_PAGE..END_PAGE (inclusive) and stores every
 * article found.
 *
 * Pages and articles are processed one at a time. A list page that fails to
 * load is logged and skipped so the remaining pages are still crawled; an
 * article whose detail fetch or save fails is logged (with the reason) and
 * skipped. The Prisma connection is closed when the run ends, whether it
 * succeeded or not.
 */
async function main(): Promise<void> {
  // Reduce an unknown thrown value to something readable for the log.
  const reasonOf = (err: unknown) =>
    err instanceof Error ? err.message : String(err);

  try {
    for (let page = START_PAGE; page <= END_PAGE; page++) {
      console.log(`\n📄 正在抓第 ${page} 页...`);

      let list: Awaited<ReturnType<typeof fetchList>>;
      try {
        list = await fetchList(page);
      } catch (err) {
        // One unreachable page should not abort the rest of the range.
        console.error(`❌ 列表页失败: 第 ${page} 页`, reasonOf(err));
        continue;
      }

      console.log(`抓到 ${list.length} 条`);
      if (list.length === 0) {
        console.log("⚠️ 该页无数据,跳过");
        continue;
      }

      for (const item of list) {
        try {
          const detail = await fetchDetail(item.link);
          await savePost({
            // Fall back to the list title when the detail page has none.
            title: detail.title || item.title,
            excerpt: item.excerpt,
            date: detail.date,
            content: detail.content,
          });
        } catch (err) {
          // Covers both the detail fetch and the database write; include
          // the reason so the two can be told apart in the log.
          console.log("❌ 详情失败:", item.link, reasonOf(err));
        }
      }
    }
    console.log("\n🎉 全部完成");
  } catch (err) {
    console.error("❌ 爬虫崩了:", err);
  } finally {
    await prisma.$disconnect();
  }
}

// main() handles its own errors, but the disconnect in `finally` can still
// reject; catch that here instead of leaving a floating promise.
main().catch((err: unknown) => {
  console.error("❌ 爬虫崩了:", err);
  process.exitCode = 1;
});
// File location: crawler/test.ts (batch crawling script).
// Run with:
//   npx tsx crawler/test.ts
//
// ---- prisma.ts ----
import { PrismaClient } from "@prisma/client";
// Shape of the slot on the global object where the client is cached.
type PrismaGlobal = { prisma: PrismaClient | undefined };

// View of `globalThis` typed so the cached client can be read and written.
const globalForPrisma = globalThis as unknown as PrismaGlobal;

/**
 * Shared Prisma client: reuses the instance cached on the global object
 * when there is one, otherwise constructs a new client.
 */
export const prisma = globalForPrisma.prisma ?? new PrismaClient();

// Outside production, cache the client on the global object so a later
// evaluation of this module picks it up instead of creating another one
// (presumably to survive dev-time module reloads).
if (process.env.NODE_ENV !== "production") {
  globalForPrisma.prisma = prisma;
}
