导航菜单

  • 0.api
  • 0.Async
  • 0.module
  • 1.ES2015
  • 2.Promise
  • 3.Node
  • 4.NodeInstall
  • 5.REPL
  • 6.NodeCore
  • 7.module&NPM
  • 8.Encoding
  • 9.Buffer
  • 10.fs
  • 11.Stream-1
  • 11.Stream-2
  • 11.Stream-3
  • 11.Stream-4
  • 12-Network-2
  • 12.NetWork-3
  • 12.Network-1
  • 13.tcp
  • 14.http-1
  • 14.http-2
  • 15.compress
  • 16.crypto
  • 17.process
  • 18.yargs
  • 19.cache
  • 20.action
  • 21.https
  • 22.cookie
  • 23.session
  • 24.express-1
  • 24.express-2
  • 24.express-3
  • 24.express-4
  • 25.koa-1
  • 26.webpack-1-basic
  • 26.webpack-2-optimize
  • 26.webpack-3-file
  • 26.webpack-4.tapable
  • 26.webpack-5-AST
  • 26.webpack-6-sources
  • 26.webpack-7-loader
  • 26.webpack-8-plugin
  • 26.webpack-9-hand
  • 26.webpack-10-prepare
  • 28.redux
  • 28.redux-jwt-back
  • 28.redux-jwt-front
  • 29.mongodb-1
  • 29.mongodb-2
  • 29.mongodb-3
  • 29.mongodb-4
  • 29.mongodb-5
  • 29.mongodb-6
  • 30.cms-1-mysql
  • 30.cms-2-mysql
  • 30.cms-3-mysql
  • 30.cms-4-nunjucks
  • 30.cms-5-mock
  • 30.cms-6-egg
  • 30.cms-7-api
  • 30.cms-8-roadhog
  • 30.cms-9-yaml
  • 30.cms-10-umi
  • 30.cms-12-dva
  • 30.cms-13-dva-ant
  • 30.cms-14-front
  • 30.cms-15-deploy
  • 31.dva
  • 31.cms-13-dva-antdesign
  • 33.redis
  • 34.unittest
  • 35.jwt
  • 36.websocket-1
  • 36.websocket-2
  • 38.chat-api-1
  • 38.chat-api-2
  • 38.chat-3
  • 38.chat-api-3
  • 38.chat
  • 38.chat2
  • 38.chat2
  • 39.crawl-0
  • 39.crawl-1
  • 39.crawl-2
  • 40.deploy
  • 41.safe
  • 42.test
  • 43.nginx
  • 44.enzyme
  • 45.docker
  • 46.elastic
  • 47.oauth
  • 48.wxpay
  • index
  • 52.UML
  • 53.design
  • index
  • 54.linux
  • 57.ts
  • 56.react-ssr
  • 58.ts_react
  • 59.ketang
  • 59.ketang2
  • 61.1.devops-linux
  • 61.2.devops-vi
  • 61.3.devops-user
  • 61.4.devops-auth
  • 61.5.devops-shell
  • 61.6.devops-install
  • 61.7.devops-system
  • 61.8.devops-service
  • 61.9.devops-network
  • 61.10.devops-nginx
  • 61.11.devops-docker
  • 61.12.devops-jekins
  • 61.13.devops-groovy
  • 61.14.devops-php
  • 61.15.devops-java
  • 61.16.devops-node
  • 61.17.devops-k8s
  • 62.1.react-basic
  • 62.2.react-state
  • 62.3.react-high
  • 62.4.react-optimize
  • 62.5.react-hooks
  • 62.6.react-immutable
  • 62.7.react-mobx
  • 62.8.react-source
  • 63.1.redux
  • 63.2.redux-middleware
  • 63.3.redux-hooks
  • 63.4.redux-saga
  • 63.5.redux-saga-hand
  • 64.1.router
  • 64.2.router-connected
  • 65.1.typescript
  • 65.2.typescript
  • 65.3.typescript
  • 65.4.antd
  • 65.4.definition
  • 66-1.vue-base
  • 66-2.vue-component
  • 66-3.vue-cli3.0
  • 66-4.$message组件
  • 66-5.Form组件
  • 66-6.tree
  • 66-7.vue-router-apply
  • 66-8.axios-apply
  • 66-9.vuex-apply
  • 66-10.jwt-vue
  • 66-11.vue-ssr
  • 66-12.nuxt-apply
  • 66-13.pwa
  • 66-14.vue单元测试
  • 66-15.权限校验
  • 67-1-network
  • 68-2-wireshark
  • 7.npm2
  • 69-hooks
  • 70-deploy
  • 71-hmr
  • 72.deploy
  • 73.import
  • 74.mobile
  • 75.webpack-1.文件分析
  • 75.webpack-2.loader
  • 75.webpack-3.源码流程
  • 75.webpack-4.tapable
  • 75.webpack-5.prepare
  • 75.webpack-6.resolve
  • 75.webpack-7.loader
  • 75.webpack-8.module
  • 75.webpack-9.chunk
  • 75.webpack-10.asset
  • 75.webpack-11.实现
  • 76.react_optimize
  • 77.ts_ketang_back
  • 77.ts_ketang_front
  • 78.vue-domdiff
  • 79.grammar
  • 80.tree
  • 81.axios
  • 82.1.react
  • 82.2.react-high
  • 82.3.react-router
  • 82.4.redux
  • 82.5.redux_middleware
  • 82.6.connected
  • 82.7.saga
  • 82.8.dva
  • 82.8.dva-source
  • 82.9.roadhog
  • 82.10.umi
  • 82.11.antdesign
  • 82.12.ketang-front
  • 82.12.ketang-back
  • 83.upload
  • 84.graphql
  • 85.antpro
  • 86.1.uml
  • 86.2.design
  • 87.postcss
  • 88.react16-1
  • 89.nextjs
  • 90.react-test
  • 91.react-ts
  • 92.rbac
  • 93.tsnode
  • 94.1.JavaScript
  • 94.2.JavaScript
  • 94.3.MODULE
  • 94.4.EventLoop
  • 94.5.文件上传
  • 94.6.https
  • 94.7. nginx
  • 95.1. react
  • 95.2.react
  • 96.1.react16
  • 96.2.fiber
  • 96.3.fiber
  • 97.serverless
  • 98.websocket
  • 100.1.react-basic
  • 101.1.monitor
  • 101.2.monitor
  • 102.java
  • 103.1.webpack-usage
  • 103.2.webpack-bundle
  • 103.3.webpack-ast
  • 103.4.webpack-flow
  • 103.5.webpack-loader
  • 103.6.webpack-tapable
  • 103.7.webpack-plugin
  • 103.8.webpack-optimize1
  • 103.9.webpack-optimize2
  • 103.10.webpack-hand
  • 103.11.webpack-hmr
  • 103.11.webpack5
  • 103.13.splitChunks
  • 103.14.webpack-sourcemap
  • 103.15.webpack-compiler1
  • 103.15.webpack-compiler2
  • 103.16.rollup.1
  • 103.16.rollup.2
  • 103.16.rollup.3
  • 103.16.vite.basic
  • 103.16.vite.source
  • 103.16.vite.plugin
  • 103.16.vite.1
  • 103.16.vite.2
  • 103.17.polyfill
  • 104.1.binary
  • 104.2.binary
  • 105.skeleton
  • 106.1.react
  • 106.2.react_hooks
  • 106.3.react_router
  • 106.4.redux
  • 106.5.redux_middleware
  • 106.6.connected-react-router
  • 106.6.redux-first-history
  • 106.7.redux-saga
  • 106.8.dva
  • 106.9.umi
  • 106.10.ketang
  • 106.11.antdesign
  • 106.12.antpro
  • 106.13.router-6
  • 106.14.ssr
  • 106.15.nextjs
  • 106.16.1.cms
  • 106.16.2.cms
  • 106.16.3.cms
  • 106.16.4.cms
  • 106.16.mobx
  • 106.17.fomily
  • 107.fiber
  • 108.http
  • 109.1.webpack_usage
  • 109.2.webpack_source
  • 109.3.dll
  • 110.nest.js
  • 111.xstate
  • 112.Form
  • 113.redux-saga
  • 114.react+typescript
  • 115.immer
  • 116.pro5
  • 117.css-loader
  • 118.1.umi-core
  • 119.2.module-federation
  • 119.1.module-federation
  • 120.create-react-app
  • 121.react-scripts
  • 122.react-optimize
  • 123.jsx-runtime
  • 124.next.js
  • 125.1.linux
  • 125.2.linux-vi
  • 125.3.linux-user
  • 125.4.linux-auth
  • 125.5.linux-shell
  • 125.6.linux-install
  • 125.7.linux-system
  • 125.8.linux-service
  • 125.9.linux-network
  • 125.10.nginx
  • 125.11.docker
  • 125.12.ci
  • 125.13.k8s
  • 125.14.k8s
  • 125.15.k8s
  • 125.16.k8s
  • 126.11.react-1
  • 126.12.react-2
  • 126.12.react-3
  • 126.12.react-4
  • 126.12.react-5
  • 126.12.react-6
  • 126.12.react-7
  • 126.12.react-8
  • 127.frontend
  • 128.rollup
  • 129.px2rem-loader
  • 130.health
  • 131.hooks
  • 132.keepalive
  • 133.vue-cli
  • 134.react18
  • 134.2.react18
  • 134.3.react18
  • 135.function
  • 136.toolkit
  • 137.lerna
  • 138.create-vite
  • 139.cli
  • 140.antd
  • 141.react-dnd
  • 142.1.link
  • 143.1.gulp
  • 143.2.stream
  • 143.3.gulp
  • 144.1.closure
  • 144.2.v8
  • 144.3.gc
  • 145.react-router-v6
  • 146.browser
  • 147.lighthouse
  • 148.1.basic
  • 148.2.basic
  • 148.3.basic
  • 148.4.basic
  • 148.5.basic
  • 149.1.vite
  • 149.2.vite
  • 149.3.vite
  • 149.4.vite
  • 150.react-window
  • 151.react-query
  • 152.useRequest
  • 153.transition
  • 154.emotion
  • 155.1.formily
  • 155.2.formily
  • 155.3.formily
  • 155.3.1.mobx.usage
  • 155.3.2.mobx.source
  • 156.vue-loader
  • 103.11.mf
  • 157.1.react18
  • 158.umi4
  • 159.rxjs
  • 159.rxjs2
  • 160.bff
  • 161.zustand
  • 162.vscode
  • 163.emp
  • 164.cors
  • 1. 爬取内容
    • 1. http/axios 等爬取API接口
    • 2. superagent/request/crawl爬取HTML页面
    • 3. 使用puppeteer控制chromium
  • 2.数据持久化
  • 3. 数据订阅
  • 4. 分发
  • 参考

1. 爬取内容 #

1. http/axios 等爬取API接口 #

let axios = require('axios');
axios.get('https://follow-api-ms.juejin.im/v1/getUserFollowInfo?uid=551d6923e4b0cd5b623f54da&src=web')
    .then(res => console.log(res.data))

2. superagent/request/crawl爬取HTML页面 #

let request = require('request');
request('https://juejin.im/tag/%E5%89%8D%E7%AB%AF', (err, response, body) => {
    let regexp = /class="title" data-v-\w+>(.+?)<\/a>/g;
    let titles = [];
    body.replace(regexp, (matched, title) => {
        titles.push(title);
    });
    console.log(titles);
});

3. 使用puppeteer控制chromium #

  • puppeteer是Chrome团队开发的一个node库
  • 可以通过api来控制浏览器的行为,比如点击,跳转,刷新,在控制台执行js脚本等等
  • 通过这个工具可以用来写爬虫,自动签到,网页截图,生成pdf,自动化测试等
(async () => {
     const browser = await puppeteer.launch();//打开浏览器
     const page = await browser.newPage();//打开一个空白页
     await page.goto('https://www.baidu.com');//在地址栏输入网址并等待加载
     await page.screenshot({ path: 'baidu.png' });//截个图
     await browser.close();//关掉浏览器
})();
const puppeteer=require('puppeteer');
const fs=require('fs');
(async function () {
        const browser=await puppeteer.launch({headless:false});
        const page=await browser.newPage();
        await page.goto('https://juejin.im/tag/%E5%89%8D%E7%AB%AF', {
waitUntil: 'networkidle2'
        });
        await page.waitFor(500);
        let comments = await page.$$eval('a.title', els => {
return els.map(item => item.innerText);
        });
        fs.writeFileSync('comments.txt',comments.join('\r\n'),'utf8');
        await browser.close();
})();
const puppeteer=require('puppeteer');
(async function () {
    const browser=await puppeteer.launch({headless:false});//启动浏览器
    let page = await browser.newPage();//创建一个 Page 实例
    await page.setJavaScriptEnabled(true);//启用javascript
    await page.goto("https://www.jd.com/");
    const searchInput = await page.$("#key");//获取元素
    await searchInput.focus(); //定位到搜索框
    await page.keyboard.type("手机");//输入手机
    const searchBtn = await page.$(".button");
    await searchBtn.click();
    await page.waitForSelector('.gl-item'); //等待元素加载之后,否则获取不了异步加载的元素
    const links = await page.$$eval('.gl-item > .gl-i-wrap > .p-img > a', links => {
        return links.map(a => {
            return {
                href: a.href.trim(),
                title: a.title
            }
        });
    });
    page.close();
    const aTags = links.splice(0, 1);
    for (var i = 0; i < aTags.length; i++) {
        page=await browser.newPage();
        page.setJavaScriptEnabled(true);
        await page.setViewport({//修改浏览器视窗大小
            width: 1920,
            height: 1080
        });
        var a = aTags[i];
        await page.goto(a.href, {timeout: 0});
        let filename = "items-" + i + ".png";
        await page.screenshot({
            path: filename,
            fullPage: true
        });
        page.close();
    }
    browser.close();
})();

2.数据持久化 #

  • 根据爬取的规则和策略,把爬取到的数据储到数据库中
  • 如果要兼容不同的来源,需要对数据需要格式化
  • 为不同的数据建立索引方便检索

3. 数据订阅 #

  • 用户可以按照自己的兴趣和需要进行订制内容

4. 分发 #

有新的数据的时候

  • 可以使用邮件推送到订阅者
  • 可以使用极光推送等推送服务
  • 可以使用及时通信服务向客户端推送

参考 #

  • axios
  • request
  • puppeteer
  • puppeteer api
  • showapi

访问验证

请输入访问令牌

Token不正确,请重新输入