编程开源技术交流,分享技术与知识

网站首页 > 开源技术 正文

利用puppeteer采集自己想要的书籍

wxchong 2024-06-17 22:31:16 开源技术 13 ℃ 0 评论

安装node


cd /usr/local/src/
# Puppeteer and the async/await scripts below need a modern Node.js release;
# v0.10.x predates async functions and cannot run them.
wget https://nodejs.org/dist/v18.20.4/node-v18.20.4.tar.gz
tar zxvf node-v18.20.4.tar.gz
cd node-v18.20.4
./configure --prefix=/usr/local/node/18.20.4
make
make install
# The custom prefix is not on PATH by default, so expose it before calling node.
export PATH=/usr/local/node/18.20.4/bin:$PATH
node -v

安装完node之后npm就安装好了

vagrant@homestead:~/code/Ecc3.0_System01$ npm -v

切换cnpm国内源


# registry.npm.taobao.org has been retired; registry.npmmirror.com is its successor.
╰$ npm install -g cnpm --registry=https://registry.npmmirror.com

安装puppeteer


╰$ cnpm i puppeteer

测试脚本


const puppeteer = require('puppeteer');  

/**
 * Smoke test for Puppeteer: opens Baidu, saves the home page as PNG and PDF,
 * searches for "Python", then saves the result page as PNG and PDF.
 * All files are written under ./output/ (this script does not create that directory).
 * Errors are logged, and the browser is always closed.
 */
(async () => {
    // page.waitFor() was removed from Puppeteer; a plain timer gives the same fixed pause.
    const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

    let browser;
    try {
        browser = await puppeteer.launch();
        const page = await browser.newPage();
        await page.goto('https://www.baidu.com/');

        // Fixed pause so the page can settle
        await sleep(1000 * 2);

        // Screenshot of the home page
        await page.screenshot({path: './output/baidu.png', fullPage: true});

        // PDF of the home page
        await page.pdf({path: './output/baidu.pdf', format: 'A4', printBackground: true});

        // Type the query; `delay` is the pause between key presses in milliseconds
        await page.type('#kw', 'Python', {delay: 100});
        // Submit the search
        await page.click('#su');

        // Fixed pause so the results can render
        await sleep(1000 * 5);

        await page.setViewport({
            width: 1920,
            height: 1080
        });

        // Screenshot of the result page
        await page.screenshot({path: './output/baidu_python.png', fullPage: true});

        // PDF of the result page
        await page.pdf({path: './output/baidu_python.pdf', format: 'a4', printBackground: true});
    } catch (error) {
        console.log(`this is the ${error}`);
    } finally {
        // Close the browser on failure too, so no Chromium process is left behind
        if (browser) {
            await browser.close().catch((closeError) => {
                console.log(`this is the ${closeError}`);
            });
        }
    }
})();

结果


批量下载 ES6 文档


const puppeteer = require("puppeteer");

/**
 * Batch download: loads the ES6 tutorial's README page, collects every link
 * matched by `ol li a`, and prints each linked page to a PDF under ./output/.
 * Errors are logged, and the browser is always closed.
 */
(async () => {
    // Relative output directory, matching the test script above, instead of one user's absolute path
    const OUTPUT_DIR = './output';

    let browser;
    try {
        browser = await puppeteer.launch({
            headless: true,
            // Switches must start with two ASCII hyphens; a typographic dash is not recognised as a switch.
            args: [
                '--disable-gpu', // GPU hardware acceleration
                '--disable-dev-shm-usage', // use temporary files instead of /dev/shm shared memory
                '--disable-setuid-sandbox', // setuid sandbox
                '--no-first-run', // skip first-run setup; starts with a blank page
                '--no-sandbox', // sandbox mode
                '--no-zygote',
                '--single-process' // run in a single process
            ]
        });
        const page = await browser.newPage();

        await page.goto('http://es6.ruanyifeng.com/#README', {
            timeout: 0 // no navigation timeout
        });

        // Runs in the page context: one {href, name} record per table-of-contents link
        const aTags = await page.evaluate(() =>
            [...document.querySelectorAll('ol li a')].map((a) => ({
                href: a.href.trim(),
                name: a.text
            }))
        );

        // Starts at index 1, as the original did: the first link is skipped.
        // NOTE(review): presumably because it is the README page already loaded above — confirm.
        for (let i = 1; i < aTags.length; i++) {
            const { href, name } = aTags[i];
            // Path separators in the link text would otherwise be read as directories
            const safeName = name.replace(/[\\/]/g, '_');
            const chapterPage = await browser.newPage();
            try {
                await chapterPage.goto(href, { timeout: 0 });
                await chapterPage.pdf({ path: `${OUTPUT_DIR}/${safeName}.pdf`, format: 'a4' });
            } finally {
                // Close the tab even when this chapter fails, so tabs do not pile up
                await chapterPage.close();
            }
            console.log(`完成个数:${i}`);
        }
        console.log("完成");
    } catch (err) {
        console.log(`this is the ${err}`);
    } finally {
        // Close the browser on failure too, so no Chromium process is left behind
        if (browser) {
            await browser.close().catch((closeErr) => {
                console.log(`this is the ${closeErr}`);
            });
        }
    }
})();

执行


╰$ node crawl.js

结果


将PDF合并到一块

╰$ sudo apt-get install pdftk

╰$ cnpm i pdf-merge

脚本

const PDFMerge = require('pdf-merge');
const path = require('path');
const fs = require('fs');

/**
 * @desc Builds a POSIX-style path located under this script's directory.
 * @param {String} dir first path segment, relative to the script directory
 * @param {String} dir2 optional second segment appended after `dir` (defaults to '')
 * @return {String} the joined path
 */
function resolve(dir, dir2 = ''){
	const segments = [__dirname, './', dir, dir2];
	return path.posix.join(...segments);
}

// Configuration: input and output directories, resolved relative to this script
const config = {
	entry: './output/',
	output: './data/',
};

/**
 * @desc Extracts the leading chapter number from a file name such as "12.xxx.pdf".
 * @param {String} name file name
 * @return {Number} the leading number, or 0 when the name has no "<digits>." prefix
 */
function chapterNumber(name){
	const match = /^(\d+)\./.exec(name);
	return match === null ? 0 : Number(match[1]);
}

// Every entry found in the input directory
const filenameArr = fs.readdirSync(resolve(config.entry));

// Keep only PDFs, ordered by chapter number.
// The number must be read from the capture group: coercing the whole match
// array with unary plus yields NaN, which makes the comparator meaningless.
const sortedFilenameArr = filenameArr
	.filter((name) => name.toLowerCase().endsWith('.pdf'))
	.sort((str1, str2) => chapterNumber(str1) - chapterNumber(str2));

// Full paths of the files to merge, in merge order
const files = sortedFilenameArr.map((el) => resolve(`${config.entry}${el}`));

console.log('files', files);

const outputPath = resolve(config.output);

const isExists = fs.existsSync(outputPath);

console.log('isExists', isExists, 'outputPath', outputPath);

/**
 * @desc Creates the output directory (`outputPath`), including any missing
 *       parent directories. Failures are logged, not thrown.
 */
function mkdirOutputpath(){
	try{
		// recursive: also creates missing parents and does not throw when the directory already exists
		fs.mkdirSync(outputPath, { recursive: true });
		console.log('mkdir is successful!');
	} catch(e){
		console.log('mkdir is failed!', e);
	}
}
// Create the output directory when it does not exist yet
if(!isExists){
	mkdirOutputpath();
}

console.log('let\'s start merge...');

// A timestamp in the name keeps repeated runs from overwriting each other
const filename = `ES6 入门教程-${Date.now()}.pdf`;

const output = resolve(`${config.output}${filename}`);

// Merge all collected PDFs and save the result as a new file
PDFMerge(files, {
	output,
})
.then(() => {
	console.log('merge is successful!');
})
.catch((err) => {
	// Without a handler a failed merge would surface as an unhandled rejection
	console.error('merge is failed!', err);
	process.exitCode = 1;
});

结果

Tags:

本文暂时没有评论,来添加一个吧(●'◡'●)

欢迎 发表评论:

最近发表
标签列表