编程开源技术交流,分享技术与知识

网站首页 > 开源技术 正文

Java版的Puppeteer爬虫框架

wxchong 2024-06-17 22:31:39 开源技术 12 ℃ 0 评论

github地址:https://github.com/fanyong920/jvppeteer


快速开始

自动下载最新chromium并启动:

Bash
package com.ruiyun.example;

import com.ruiyun.jvppeteer.core.Puppeteer;
import com.ruiyun.jvppeteer.core.browser.Browser;
import com.ruiyun.jvppeteer.core.browser.BrowserFetcher;

import java.io.IOException;
import java.util.concurrent.ExecutionException;

/**
 * 展示下载最新的chromuim浏览器的例子
 */
public class DownloadChromiumExample2 {
    public static void main(String[] args) throws IOException, InterruptedException, ExecutionException {

        Puppeteer puppeteer = new Puppeteer();
        //创建下载实例
        BrowserFetcher browserFetcher = puppeteer.createBrowserFetcher();
        //下载最新版本的chromuim
        browserFetcher.download();
        Browser browser = Puppeteer.launch(false);
        String version = browser.version();
        System.out.println(version);
    }
}

爬取整个页面的内容:

Bash
package com.ruiyun.example;

import com.ruiyun.jvppeteer.core.Puppeteer;
import com.ruiyun.jvppeteer.options.LaunchOptions;
import com.ruiyun.jvppeteer.options.OptionsBuilder;
import com.ruiyun.jvppeteer.core.browser.Browser;
import com.ruiyun.jvppeteer.core.page.Page;

import java.io.IOException;
import java.util.ArrayList;

public class PageContentExample {

    public static void main(String[] args) throws InterruptedException, IOException {
        String path = new String("F:\\java教程\\49期\\vuejs\\puppeteer\\.local-chromium\\win64-722234\\chrome-win\\chrome.exe".getBytes(),"UTF-8");

//       String  path ="D:\\develop\\project\\toString\\chrome-win\\chrome.exe";
        ArrayList<String> arrayList = new ArrayList<>();
        LaunchOptions options = new OptionsBuilder().withArgs(arrayList).withHeadless(false).withExecutablePath(path).build();
        arrayList.add("--no-sandbox");
        arrayList.add("--disable-setuid-sandbox");
        Browser browser = Puppeteer.launch(options);
        Page page = browser.newPage();
        page.goTo("https://www.baidu.com/?tn=98012088_10_dg&ch=3");
        String content = page.content();
        System.out.println("=======================content=============="+content);
    }
}

截图

package com.ruiyun.example;

import com.ruiyun.jvppeteer.core.Puppeteer;
import com.ruiyun.jvppeteer.core.browser.Browser;
import com.ruiyun.jvppeteer.core.page.Page;
import com.ruiyun.jvppeteer.options.Clip;
import com.ruiyun.jvppeteer.options.LaunchOptions;
import com.ruiyun.jvppeteer.options.OptionsBuilder;
import com.ruiyun.jvppeteer.options.ScreenshotOptions;

import java.util.ArrayList;

public class PagescreenshotExample {

    public static void main(String[] args) throws Exception {
       // String path = new String("F:\\java教程\\49期\\vuejs\\puppeteer\\.local-chromium\\win64-722234\\chrome-win\\chrome.exe".getBytes(),"UTF-8");
        ArrayList<String> arrayList = new ArrayList<>();
        String path = "D:\\develop\\project\\toString\\chrome-win\\chrome.exe";

        LaunchOptions options = new OptionsBuilder().withArgs(arrayList).withHeadless(true).withExecutablePath(path).build();
        arrayList.add("--no-sandbox");
        arrayList.add("--disable-setuid-sandbox");
        Browser browser = Puppeteer.launch(options);
        Page page = browser.newPage();
        page.goTo("https://www.baidu.com/?tn=98012088_10_dg&ch=3");


//        ScreenshotOptions screenshotOptions = new ScreenshotOptions();
//        //设置截图范围
//        Clip clip = new Clip(1.0,1.56,400,400);
//        screenshotOptions.setClip(clip);
//        //设置存放的路径
//        screenshotOptions.setPath("test.png");
//        page.screenshot(screenshotOptions);
        ScreenshotOptions screenshotOptions = new ScreenshotOptions();
        //设置截图范围
        Clip clip = new Clip(1.0,1.56,400,400);
        screenshotOptions.setClip(clip);
        //设置存放的路径
        screenshotOptions.setPath("test.png");
        page.screenshot(screenshotOptions);

    }
}

文件选择

package com.ruiyun.example;

import com.ruiyun.jvppeteer.core.Puppeteer;
import com.ruiyun.jvppeteer.core.browser.Browser;
import com.ruiyun.jvppeteer.core.page.ElementHandle;
import com.ruiyun.jvppeteer.core.page.FileChooser;
import com.ruiyun.jvppeteer.core.page.Page;
import com.ruiyun.jvppeteer.options.LaunchOptions;
import com.ruiyun.jvppeteer.options.OptionsBuilder;
import com.ruiyun.jvppeteer.options.PageNavigateOptions;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;

public class PageFileChooserExample {

    public static void main(String[] args) throws InterruptedException, ExecutionException, IOException {
//        String path = new String("F:\\java教程\\49期\\vuejs\\puppeteer\\.local-chromium\\win64-722234\\chrome-win\\chrome.exe".getBytes(),"UTF-8");
        ArrayList<String> arrayList = new ArrayList<>();
        String path = "D:\\develop\\project\\toString\\chrome-win\\chrome.exe";

        LaunchOptions options = new OptionsBuilder().withArgs(arrayList).withHeadless(false).withExecutablePath(path).build();
        arrayList.add("--no-sandbox");
        arrayList.add("--disable-setuid-sandbox");
        Browser browser = Puppeteer.launch(options);

        Page page = browser.newPage();
        PageNavigateOptions options1 = new PageNavigateOptions();
        options1.setWaitUntil(Arrays.asList("domcontentloaded"));
        page.goTo("https://www.baidu.com/?tn=98012088_10_dg&ch=3");
        Future<FileChooser> fileChooserFuture = page.waitForFileChooser(30000);
        ElementHandle elementHandle = page.$("#form > span.bg.s_ipt_wr.quickdelete-wrap > span.soutu-btn");
        elementHandle.click();
        //点击选择文件的按钮
        ElementHandle button = page.$("#form > div > div.soutu-state-normal > div.upload-wrap > input");
        button.click();
        //等待一个选择文件的弹窗事件返回
        FileChooser fileChooser = fileChooserFuture.get();

        //选择本地的文件
        List<String> paths = new ArrayList<>();
        paths.add("C:\\Users\\howay\\Desktop\\sunway.png");
        fileChooser.accept(paths);

    }
}

除此之外,还有更多的功能,Jvppeteer可以做:

  • 生成页面 PDF。
  • 抓取 SPA(单页应用)并生成预渲染内容(即“SSR”(服务器端渲染))。
  • 自动提交表单,进行 UI 测试,键盘输入等。
  • 创建一个时时更新的自动化测试环境。 使用最新的 JavaScript 和浏览器功能直接在最新版本的Chrome中执行测试。
  • 捕获网站的 timeline trace,用来帮助分析性能问题。
  • 测试浏览器扩展。
  • 本文暂时没有评论,来添加一个吧(●'◡'●)

    欢迎 发表评论:

    最近发表
    标签列表