美文网首页
Java爬虫_动态页面

Java爬虫_动态页面

作者: 笑才 | 来源:发表于2019-03-10 13:16 被阅读0次

一、爬虫介绍:请查看我的上篇文章 “Java爬虫_静态页面”
二、动态爬虫工具介绍:
1、IDEA,开发工具,创建Maven项目
2、htmlunit:是自动化测试工具,集成了下载(HttpClient),Dom(NekoHtml),驱动JS(Rhino)
3、其它JAR包:junit、jsoup、jxl
三、开发过程及相关代码
3.1、创建Maven项目


image.png

3.2、pom.xml中添加项目依赖

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cll</groupId>
    <artifactId>demo</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <!-- Force UTF-8 everywhere and compile for Java 7. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <maven.compiler.encoding>UTF-8</maven.compiler.encoding>
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
    </properties>

    <dependencies>
        <!-- Unit testing (test scope only). -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>
        <!-- Headless browser: bundles HttpClient (download), NekoHtml (DOM)
             and Rhino (JavaScript) — used to render the dynamic mail pages. -->
        <dependency>
            <groupId>net.sourceforge.htmlunit</groupId>
            <artifactId>htmlunit</artifactId>
            <version>2.27</version>
        </dependency>
        <!-- HTML parsing/selection of the fragments HtmlUnit extracts. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.8.3</version>
        </dependency>
        <!-- Apache POI: Excel (.xls) export support. -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.10.1</version>
        </dependency>
        <!-- JExcelAPI (repackaged): alternative Excel read/write library. -->
        <dependency>
            <groupId>com.hynnet</groupId>
            <artifactId>jxl</artifactId>
            <version>2.6.12.1</version>
        </dependency>
    </dependencies>
</project>

3.3、创建一个java类QYEmailHelper.java

import CaililiangTools.ConfigHelper;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Properties;

public class QYEmailHelper {
    // NOTE(review): state is an odd mix of static (webClient, baseUrl, num)
    // and instance (returnList, properties) fields; preserved as-is for
    // compatibility, but this class is NOT safe for concurrent use or for
    // more than one instance at a time.
    static WebClient webClient = new WebClient(BrowserVersion.CHROME);
    ArrayList<HashMap<String, String>> returnList = new ArrayList<HashMap<String, String>>();
    static String baseUrl = "";
    static int num = 1;
    ConfigHelper configHelper = new ConfigHelper();
    Properties properties = null;

    /**
     * Configures the shared headless-browser client: cookies and JS enabled,
     * CSS disabled (avoids a second request just for rendering), insecure SSL
     * accepted, and script/status-code errors suppressed instead of thrown.
     * Also loads the e-mail → "dept@@name" lookup table from ConfigHelper.
     */
    public void WebClientInit() {
        webClient.getCookieManager().setCookiesEnabled(true); // enable cookies
        webClient.getOptions().setActiveXNative(false);
        webClient.getOptions().setRedirectEnabled(true); // follow redirects
        webClient.getOptions().setCssEnabled(false); // skip CSS to avoid extra fetches
        webClient.getOptions().setJavaScriptEnabled(true); // the mail pages are JS-driven
        webClient.getOptions().setUseInsecureSSL(true); // ignore SSL certificate errors
        webClient.getOptions().setThrowExceptionOnScriptError(false); // tolerate broken page JS
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.setAjaxController(new NicelyResynchronizingAjaxController()); // wait for AJAX calls
        webClient.getOptions().setMaxInMemory(50000);
        properties = configHelper.getEmailUserInfos();
    }

    /** Closes the current client and replaces it with a fresh one for the next run. */
    public void closeWebClient() {
        webClient.close();
        webClient = new WebClient(BrowserVersion.CHROME);
    }

    /**
     * Logs in to the webmail site and returns the inbox URL.
     *
     * @param url      login page URL containing the placeholder account
     *                 "param=caill@primeton.com", which is replaced by {@code name}
     * @param name     account/e-mail address to log in as
     * @param password account password
     * @return absolute URL of the inbox frame (also cached in {@code baseUrl})
     * @throws Exception on any navigation or parsing failure
     */
    public String UserLogin(String url, String name, String password) throws Exception {
        url = url.replace("param=caill@primeton.com", "param=" + name);
        final HtmlPage page = webClient.getPage(url);
        System.err.println("查询中,请稍候");
        HtmlForm form = page.getForms().get(0);
        HtmlPasswordInput txtPwd = (HtmlPasswordInput) form.getInputByName("pp"); // password field
        txtPwd.setValueAttribute(password);
        HtmlSubmitInput submit = (HtmlSubmitInput) form.getInputByValue("登录");
        final HtmlPage page2 = (HtmlPage) submit.click(); // submit the login form
        DomElement e = page2.getElementById("folder_1"); // inbox folder link
        HtmlPage page3 = webClient.getPage("https://mail.primeton.com" + e.getAttribute("href"));
        // The inbox content lives in an iframe: its body cannot be parsed from
        // page3 directly, so resolve the frame's src and fetch that URL itself.
        HtmlInlineFrame frame1 = (HtmlInlineFrame) page3.getElementById("mainFrame");
        String src = frame1.getAttribute("src");
        baseUrl = "https://mail.primeton.com" + src;
        return baseUrl;
    }

    /**
     * Scrapes one inbox page: for every mail row whose arrival time falls in
     * [startTime, endTime], adds a record with keys totime/unread/name/mail/
     * dept/title to {@code returnList}.
     *
     * @param url       inbox page URL to fetch
     * @param startTime inclusive lower bound, epoch millis
     * @param endTime   inclusive upper bound, epoch millis
     * @return arrival time of the last row seen (0 if none matched); used by
     *         {@link #grabData} to decide whether to fetch the next page
     * @throws Exception on fetch or parse failure
     */
    public long getHtmlPage(String url, long startTime, long endTime) throws Exception {
        long lastSeenTime = 0L;
        // SimpleDateFormat is not thread-safe, so it is deliberately not a
        // shared static; creating it once per call (instead of once per mail
        // row, as before) avoids needless allocation inside the loop.
        SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        HtmlPage page = webClient.getPage(url);
        HtmlBody tbody = (HtmlBody) page.getBody();
        DomNodeList<HtmlElement> tables = tbody.getElementsByTagName("table");
        for (HtmlElement he : tables) {
            long time = 0L;
            HashMap<String, String> results = new HashMap<String, String>();
            String xml = he.asXml();
            // Only genuine mail-row tables carry both of these markers.
            if (xml.startsWith("<table cellspacing=\"0\" class=") && xml.contains("<input totime=")) {
                Document document = Jsoup.parse(xml);
                Elements senders = document.getElementsByClass("cx");
                Elements titles = document.getElementsByClass("black");
                for (Element e : senders) {
                    Node node = e.childNode(1);
                    time = Long.parseLong(node.attr("totime"));
                    lastSeenTime = time;
                    String email = node.attr("fa");
                    // Default to the display name from the page, no department.
                    String name = node.attr("fn");
                    String dept = "";
                    // If the address is in the config table ("dept@@name"),
                    // prefer the configured name and department.
                    if (properties.containsKey(email)) {
                        String[] vs = properties.getProperty(email).split("@@");
                        if (vs.length == 2) {
                            dept = vs[0];
                            name = vs[1];
                        }
                    }
                    fillResult(results, node, time, name, dept, email, fmt);
                }
                for (Element e : titles) {
                    results.put("title", e.ownText());
                }
                if (time <= endTime && startTime <= time) {
                    returnList.add(results);
                }
            }
        }
        return lastSeenTime;
    }

    /** Fills one result record; extracted to remove the previously triplicated put-block. */
    private void fillResult(HashMap<String, String> results, Node node, long time,
                            String name, String dept, String email, SimpleDateFormat fmt) {
        results.put("totime", fmt.format(new Date(time)));
        // NOTE(review): this mapping looks inverted — unread="true" is labelled
        // 已读 (read). Preserved from the original; confirm against the site.
        results.put("unread", node.attr("unread").equalsIgnoreCase("true") ? "已读" : "未读");
        results.put("name", name);
        results.put("mail", email);
        results.put("dept", dept);
    }

    /**
     * Recursively pages through the inbox starting at {@code url}, collecting
     * rows into {@code returnList}, until a page's last row is at or before
     * {@code startTime}. Subsequent page URLs are derived from {@code baseUrl}
     * by substituting the page number.
     */
    public void grabData(String url, long startTime, long endTime) throws Exception {
        long lastSeenTime = getHtmlPage(url, startTime, endTime);
        if (lastSeenTime > startTime) {
            String nextPageUrl = baseUrl.replace("page=0", "page=" + num);
            num++;
            grabData(nextPageUrl, startTime, endTime);
        }
    }

    /**
     * End-to-end run: log in, scrape all mails within [startTime, endTime]
     * and export the collected records to Excel.
     *
     * @param startTime inclusive lower bound, epoch millis
     * @param endTime   inclusive upper bound, epoch millis
     * @param name      account to log in as
     * @param password  account password
     * @return 0 on success, 1 on any failure
     */
    public int exportData(long startTime, long endTime, String name, String password) {
        int returnInt = 0;
        this.WebClientInit();
        String webUrl = "https://mail.primeton.com/cgi-bin/loginpage?t=logindomain&s=logout&f=biz&param=caill@primeton.com";
        try {
            String url1 = this.UserLogin(webUrl, name, password);
            grabData(url1, startTime, endTime);
            ExcelHelper excelHelper = new ExcelHelper();
            excelHelper.exportExcel(this.returnList);
        } catch (Exception e) {
            // Previously swallowed silently; surface the cause for diagnosis.
            e.printStackTrace();
            returnInt = 1;
        } finally {
            // Cleanup was duplicated in both try and catch paths; a single
            // finally resets shared state so the next run starts from page 0.
            num = 1;
            baseUrl = "";
            returnList = new ArrayList<HashMap<String, String>>();
            closeWebClient();
        }
        return returnInt;
    }
}

四、总结
上述代码可以完成动态页面数据的获取(以上代码是一个示例的一部分,单独运行会报错,代码只供参考,下篇文章会给出全部的代码)。同时注意:开发爬虫时尽可能创建Maven项目;如果创建普通项目,需要手动引入一大串依赖,否则会一直报错,本人折腾了半天仍有问题,就放弃了。

五、注意点
5.1、页面内嵌的Iframe是不能直接解析到内容的,必须先解析获取其url,然后再通过url再次获取页面数据进行解析

相关文章

网友评论

      本文标题:Java爬虫_动态页面

      本文链接:https://www.haomeiwen.com/subject/wosupqtx.html