技术文献

Node.js采集项目

2026-06-17
import axios from "axios";
import * as cheerio from "cheerio";
import { prisma } from "../lib/prisma.ts";
// =========================
// 可配置
// =========================
/** First list page to crawl (inclusive). */
const START_PAGE = 102;
/** Last list page to crawl (inclusive). */
const END_PAGE = 151;
/** Root URL of the category whose paginated list pages are crawled. */
const BASE_URL = "https://www.maolian.net/zhinan/cheliangjiaotong/cheliangguohu";

// Fixed business IDs attached to every imported post.
/** ID used as the author of every imported post. */
const AUTHOR_ID = "cmpjqwr1l0000vu3gl5stromv";
/** ID of the category every imported post is filed under. */
const CATEGORY_ID = "cmpl2pxzc0004vuxoysram4xh";
/** ID of the tag connected to every imported post. */
const TAG_ID = "cmpjqwr44000cvu3gts229cdm";
// =========================
// 请求封装(防反爬)
// =========================
/**
 * Issues a GET request that carries browser-like headers.
 *
 * A desktop Chrome `User-Agent` and a Chinese `Accept-Language` are sent
 * with every request, and the axios `timeout` option is set to 15000.
 *
 * @param url URL to fetch.
 * @returns The response produced by `axios.get`.
 */
async function request(url: string) {
    const browserHeaders = {
        "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept-Language": "zh-CN,zh;q=0.9",
    };
    const response = await axios.get(url, {
        headers: browserHeaders,
        timeout: 15000,
    });
    return response;
}
// =========================
// 列表页抓取(核心修复)
// =========================
/**
 * Resolves an href taken from a list page into an absolute http(s) URL.
 *
 * Resolution is done against the URL of the page the href was found on, so
 * protocol-relative (`//host/...`), root-relative (`/path`) and
 * document-relative (`path`) hrefs are all handled the way a browser would.
 *
 * @param href Raw `href` attribute value (already trimmed).
 * @param pageUrl Absolute URL of the page that contained the link.
 * @returns The absolute URL, or `""` when the href is empty, unparsable, or
 *          not an http(s) link (e.g. `javascript:`).
 */
function resolveListLink(href: string, pageUrl: string): string {
    if (!href) return "";
    try {
        const resolved = new URL(href, pageUrl);
        const isHttp =
            resolved.protocol === "http:" || resolved.protocol === "https:";
        return isHttp ? resolved.href : "";
    } catch {
        // `new URL` throws on hrefs it cannot parse; treat those as "no link".
        return "";
    }
}

/**
 * Fetches one list page and extracts the article entries found on it.
 *
 * @param page Page number substituted into `${BASE_URL}/page_${page}.aspx`.
 * @returns One entry per list item that has both a non-empty title and a
 *          resolvable http(s) link. Returns an empty array when none of the
 *          known list selectors matches anything on the page.
 */
async function fetchList(page: number) {
    const url = `${BASE_URL}/page_${page}.aspx`;
    const res = await request(url);
    const $ = cheerio.load(res.data);
    const list: {
        title: string;
        link: string;
        excerpt: string;
        date: string;
    }[] = [];

    // Supports both the PC and the mobile list markup: the selectors are
    // tried in order and the first one that matches any element is used.
    const selectors = ["ul.tw-list li", "ul.con-list li"];
    const usedSelector = selectors.find((sel) => $(sel).length > 0) ?? "";
    console.log(`📌 page ${page} 使用 selector: ${usedSelector}`);
    if (!usedSelector) return [];

    $(usedSelector).each((_, el) => {
        const $item = $(el);
        // Prefer the link inside the <h3>; otherwise use the item's first link.
        const headingLink = $item.find("h3 a");
        const a = headingLink.length > 0 ? headingLink : $item.find("a").first();
        const title = a.text().trim();
        const href = a.attr("href")?.trim() || "";
        const excerpt =
            $item.find("p").text().trim() ||
            $item.find(".tj-nr").text().trim();
        const date = $item.find(".info span").first().text().trim() || "";

        // Resolve against the page URL instead of prefix matching plus a
        // hard-coded host, which mangled `//host/...` hrefs not starting
        // with `//m.` and pointed root-relative links at a different site.
        const link = resolveListLink(href, url);
        if (title && link) {
            list.push({ title, link, excerpt, date });
        }
    });
    return list;
}
// =========================
// 详情页
// =========================
/**
 * Downloads an article page and returns its title, date and cleaned body HTML.
 *
 * The body is the `.det-nr` container, cleaned in this order:
 *   1. every `<a>` is replaced by its own text,
 *   2. every `<img>` is removed,
 *   3. `style`, `target` and `title` attributes are stripped from all
 *      descendant elements.
 *
 * @param url URL of the article page.
 * @returns `title` and `date` read from `.det-title`, and `content` as the
 *          trimmed inner HTML of the cleaned container (`""` when absent).
 */
async function fetchDetail(url: string) {
    const response = await request(url);
    const $ = cheerio.load(response.data);

    const title = $(".det-title h1").text().trim();
    const date = $(".det-title .info span").first().text().trim();

    const $body = $(".det-nr");

    // Step 1: unwrap links, keeping only their text.
    for (const anchor of $body.find("a").toArray()) {
        const $anchor = $(anchor);
        $anchor.replaceWith($anchor.text());
    }

    // Step 2: drop images.
    $body.find("img").remove();

    // Step 3: strip presentational / link-related attributes.
    const strippedAttrs = ["style", "target", "title"];
    $body.find("*").each((_, node) => {
        const $node = $(node);
        for (const attrName of strippedAttrs) {
            $node.removeAttr(attrName);
        }
    });

    // Serialize what is left; a missing or empty container yields "".
    const html = $body.html();
    const content = html ? html.trim() : "";
    return { title, date, content };
}
// =========================
// slug
// =========================
/**
 * Generates a slug of the form `post-<epoch milliseconds>-<4 base-36 chars>`.
 *
 * @param _title Optional and ignored. It exists only so that callers which
 *               pass the post title type-check; the slug does not depend on it.
 * @returns The generated slug.
 */
function createSlug(_title?: string): string {
    // The base-36 expansion of Math.random() can be shorter than expected
    // (e.g. "0" for 0), so pad to keep the suffix at exactly 4 characters.
    const suffix = Math.random().toString(36).slice(2, 6).padEnd(4, "0");
    return `post-${Date.now()}-${suffix}`;
}
// =========================
// 入库
// =========================
/** Data needed to store one crawled article. */
interface CrawledPost {
    title: string;
    excerpt: string;
    content: string;
    /** Accepted from the crawler but not written to the database here. */
    date?: string;
}

/**
 * Stores a crawled article as a published post unless a post with the same
 * title already exists.
 *
 * The post is created with the fixed `AUTHOR_ID` and `CATEGORY_ID`, and a
 * `tags` row connected to `TAG_ID` is created along with it.
 *
 * @param item Article data to store.
 */
async function savePost(item: CrawledPost): Promise<void> {
    // De-duplicate by exact title match.
    const exists = await prisma.post.findFirst({
        where: { title: item.title },
    });
    if (exists) {
        console.log("⏭ 已存在:", item.title);
        return;
    }

    const post = await prisma.post.create({
        data: {
            title: item.title,
            // The slug is time/random based and does not depend on the title.
            slug: createSlug(),
            content: item.content,
            excerpt: item.excerpt,
            published: true,
            authorId: AUTHOR_ID,
            categoryId: CATEGORY_ID,
            tags: {
                create: [
                    {
                        tag: {
                            connect: { id: TAG_ID },
                        },
                    },
                ],
            },
        },
    });
    console.log("✅ 入库:", post.title);
}
// =========================
// 主流程
// =========================
/**
 * Crawls list pages `START_PAGE` through `END_PAGE` (inclusive) in order and,
 * for every entry found, fetches the detail page and stores it.
 *
 * A failure while fetching or storing a single article is logged and the
 * loop moves on to the next article. Any other error (for example a list
 * page that cannot be fetched) ends the run. The Prisma connection is closed
 * in all cases.
 */
async function main() {
    try {
        for (let page = START_PAGE; page <= END_PAGE; page++) {
            console.log(`\n📄 正在抓第 ${page} 页...`);
            const list = await fetchList(page);
            console.log(`抓到 ${list.length} 条`);
            if (list.length === 0) {
                console.log("⚠️ 该页无数据,跳过");
                continue;
            }
            for (const item of list) {
                try {
                    const detail = await fetchDetail(item.link);
                    await savePost({
                        // Prefer the title from the detail page; fall back
                        // to the one shown in the list.
                        title: detail.title || item.title,
                        excerpt: item.excerpt,
                        date: detail.date,
                        content: detail.content,
                    });
                } catch (err) {
                    // This catch covers both the detail fetch and the
                    // database write, so include the reason in the log.
                    const reason = err instanceof Error ? err.message : err;
                    console.log("❌ 详情失败:", item.link, reason);
                }
            }
        }
        console.log("\n🎉 全部完成");
    } catch (err) {
        console.error("❌ 爬虫崩了:", err);
    } finally {
        await prisma.$disconnect();
    }
}
main();

 

// 文件位置:crawler/test.ts(批量采集脚本)
运行:
npx tsx crawler/test.ts

lib/prisma.ts(即上文通过 "../lib/prisma.ts" 导入的 Prisma 客户端)

import { PrismaClient } from "@prisma/client";
// View of globalThis as a holder for one shared PrismaClient instance.
type PrismaGlobal = { prisma: PrismaClient | undefined };
const globalForPrisma = globalThis as unknown as PrismaGlobal;

/** The client already stored on globalThis if there is one, otherwise a new one. */
export const prisma = globalForPrisma.prisma ?? new PrismaClient();

// Outside production, keep the client on globalThis so that a later
// evaluation of this module picks it up instead of constructing another.
if (process.env.NODE_ENV !== "production") {
    globalForPrisma.prisma = prisma;
}