一、爬虫介绍:请查看我的上篇文章 “Java爬虫_静态页面”
二、动态爬虫工具介绍:
1、IDEA,开发工具,创建Maven项目
2、htmlunit:是自动化测试工具,集成了下载(HttpClient),Dom(NekoHtml),驱动JS(Rhino)
3、其它JAR包:junit、jsoup、jxl
三、开发过程及相关代码
3.1、创建Maven项目
image.png
3.2、pom.xml中添加项目依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>cll</groupId>
<artifactId>demo</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<!-- Force UTF-8 everywhere so Chinese string literals survive compilation. -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<maven.compiler.encoding>UTF-8</maven.compiler.encoding>
<!-- Compiled against Java 7; the crawler code below uses no newer language features. -->
<maven.compiler.source>1.7</maven.compiler.source>
<maven.compiler.target>1.7</maven.compiler.target>
</properties>
<dependencies>
<!-- Unit testing only; not shipped (test scope). -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
<!-- Headless browser: HTTP download, DOM building and JS execution for dynamic pages. -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.27</version>
</dependency>
<!-- HTML fragment parsing/selection once a page's XML has been extracted. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
<!-- Excel export (POI and JXL are both pulled in; ExcelHelper decides which is used). -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.10.1</version>
</dependency>
<dependency>
<groupId>com.hynnet</groupId>
<artifactId>jxl</artifactId>
<version>2.6.12.1</version>
</dependency>
</dependencies>
</project>
3.3、创建一个java类QYEmailHelper.java
import CaililiangTools.ConfigHelper;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Properties;
/**
 * Crawls a (QQ-enterprise-style) webmail inbox with HtmlUnit, collects per-message
 * metadata (time, read state, sender, subject) into {@code returnList}, and hands
 * the result to {@code ExcelHelper} for export.
 *
 * <p>NOTE(review): {@code webClient}/{@code baseUrl}/{@code num} are static while
 * the collected state is per-instance, so this class is not safe for concurrent use.
 * Kept as-is to preserve the existing call contract.
 */
public class QYEmailHelper {
static WebClient webClient = new WebClient(BrowserVersion.CHROME);
ArrayList<HashMap<String,String>> returnList = new ArrayList<HashMap<String,String>>();
// Inbox list URL of the current session; "page=0" is swapped for later pages.
static String baseUrl = "";
// Next page index used by grabData's recursion.
static int num = 1;
ConfigHelper configHelper = new ConfigHelper();
// email address -> "dept@@displayName" lookup, loaded in WebClientInit().
Properties properties = null;

/**
 * Configures the shared headless browser: cookies on, JS on, CSS off (avoids
 * useless secondary requests), insecure SSL tolerated, and a resynchronizing
 * AJAX controller so asynchronous XHRs complete before pages are inspected.
 * Also loads the address-book properties used to resolve sender names.
 */
public void WebClientInit(){
webClient.getCookieManager().setCookiesEnabled(true);
webClient.getOptions().setActiveXNative(false);
webClient.getOptions().setRedirectEnabled(true);
webClient.getOptions().setCssEnabled(false);
webClient.getOptions().setJavaScriptEnabled(true);
webClient.getOptions().setUseInsecureSSL(true);
// Script/HTTP errors are expected on real-world pages; don't abort the crawl.
webClient.getOptions().setThrowExceptionOnScriptError(false);
webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
webClient.setAjaxController(new NicelyResynchronizingAjaxController());
webClient.getOptions().setMaxInMemory(50000);
properties = configHelper.getEmailUserInfos();
}

/** Disposes the current browser session and prepares a fresh one for the next run. */
public void closeWebClient(){
webClient.close();
webClient = new WebClient(BrowserVersion.CHROME);
}

/**
 * Logs in as {@code name}/{@code password} and returns the absolute URL of the
 * inbox list page (the src of the "mainFrame" iframe — an iframe's content cannot
 * be read in place, it must be fetched via its own URL).
 *
 * @param url login-page URL containing the placeholder "param=caill@primeton.com"
 * @return absolute inbox URL; also cached in {@code baseUrl} for paging
 * @throws Exception on any navigation/parse failure (propagated to exportData)
 */
public String UserLogin(String url, String name, String password) throws Exception{
url = url.replace("param=caill@primeton.com", "param=" + name);
final HtmlPage page = webClient.getPage(url);
System.err.println("查询中,请稍候");
HtmlForm form = page.getForms().get(0);
HtmlPasswordInput txtPwd = (HtmlPasswordInput) form.getInputByName("pp");
txtPwd.setValueAttribute(password);
HtmlSubmitInput submit = (HtmlSubmitInput) form.getInputByValue("登录");
final HtmlPage page2 = (HtmlPage) submit.click();
// "folder_1" is the inbox folder link on the post-login page.
DomElement e = page2.getElementById("folder_1");
HtmlPage page3 = webClient.getPage("https://mail.primeton.com" + e.getAttribute("href"));
HtmlInlineFrame frame1 = (HtmlInlineFrame) page3.getElementById("mainFrame");
String src = frame1.getAttribute("src");
baseUrl = "https://mail.primeton.com" + src;
return baseUrl;
}

/**
 * Parses one inbox list page: every message row (a table with a "totime" input)
 * inside [startTime, endTime] is added to {@code returnList}.
 *
 * @return the "totime" of the last message seen on the page (0 if none) —
 *         grabData uses it to decide whether an older page must be fetched
 */
public long getHtmlPage(String url, long startTime, long endTime) throws Exception{
long endTime2 = 0L;
HtmlPage page = webClient.getPage(url);
HtmlBody tbody = (HtmlBody) page.getBody();
DomNodeList<HtmlElement> lists = tbody.getElementsByTagName("table");
for (HtmlElement he : lists) {
long time = 0L;
HashMap<String,String> results = new HashMap<String,String>();
String xml = he.asXml();
// Only message-row tables carry a "totime" input; skip layout tables.
if (xml.startsWith("<table cellspacing=\"0\" class=") && xml.contains("<input totime=")) {
Document document = Jsoup.parse(xml);
Elements es = document.getElementsByClass("cx");
Elements es2 = document.getElementsByClass("black");
for (Element e : es) {
Node node = e.childNode(1);
time = Long.parseLong(node.attr("totime"));
endTime2 = time;
String email = node.attr("fa");
String value = properties.getProperty(email);
String[] vs = value == null ? new String[0] : value.split("@@");
if (vs.length == 2) {
// Known sender: use dept/name from the address book ("dept@@name").
fillCommonFields(results, node, email, time);
results.put("name", vs[1]);
results.put("dept", vs[0]);
} else {
// Unknown (or malformed) entry: fall back to the display name on the row.
fillCommonFields(results, node, email, time);
results.put("name", node.attr("fn"));
results.put("dept", "");
}
}
for (Element e : es2) {
results.put("title", e.ownText());
}
if (time <= endTime && startTime <= time) {
returnList.add(results);
}
}
}
return endTime2;
}

/** Fields shared by all sender branches: timestamp, read state and address. */
private void fillCommonFields(HashMap<String,String> results, Node node, String email, long time) {
results.put("totime", new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date(time)));
// NOTE(review): unread=="true" is labelled "已读" (read) — if the attribute really
// means "unread" this mapping is inverted; kept as-is, confirm against the markup.
results.put("unread", node.attr("unread").equalsIgnoreCase("true") ? "已读" : "未读");
results.put("mail", email);
}

/**
 * Recursively pages backwards through the inbox until a page's oldest message is
 * at or before {@code startTime} (pages are newest-first, so once the last time
 * on a page is <= startTime there is nothing older worth fetching).
 */
public void grabData(String url, long startTime, long endTime) throws Exception{
long endTime2 = getHtmlPage(url, startTime, endTime);
if (endTime2 > startTime) {
String nextPageUrl = baseUrl.replace("page=0", "page=" + num);
num++;
grabData(nextPageUrl, startTime, endTime);
}
}

/**
 * Full pipeline: init browser, log in, crawl the time window, export to Excel.
 *
 * @return 0 on success, 1 on any failure (existing error-code contract)
 */
public int exportData(long startTime, long endTime, String name, String password){
int returnInt = 0;
this.WebClientInit();
// BUG FIX: "&param=" had been mangled to "¶m=" (the HTML entity "&para;" collapsed
// into "¶" during copy/paste), so UserLogin's replace("param=...") never matched
// and the login always ran as the hard-coded user. Restored the literal ampersand.
String webUrl = "https://mail.primeton.com/cgi-bin/loginpage?t=logindomain&s=logout&f=biz&param=caill@primeton.com";
try {
String url1 = this.UserLogin(webUrl, name, password);
grabData(url1, startTime, endTime);
ExcelHelper excelHelper = new ExcelHelper();
excelHelper.exportExcel(this.returnList);
} catch (Exception e) {
// Previously swallowed silently; keep the error-code contract but leave a trace.
e.printStackTrace();
returnInt = 1;
} finally {
// Reset shared/static paging state and recycle the browser whatever happened
// (was duplicated in both the try and catch branches before).
num = 1;
baseUrl = "";
returnList = new ArrayList<HashMap<String,String>>();
closeWebClient();
}
return returnInt;
}
}
四、总结
上述代码可以完成动态页面数据的抓取(以上代码只是示例的一部分,单独运行会报错,仅供参考,下篇文章会给出全部代码)。另外请注意:开发爬虫时尽量创建 Maven 项目来管理依赖;如果创建普通项目,则需要手动引入一大串 JAR 包,稍有遗漏就会持续报错,本人折腾了半天仍有问题,最终放弃了普通项目的方式。
五、注意点
5.1、页面内嵌的 iframe 内容是不能直接解析到的,必须先从 iframe 的 src 属性中解析出其 URL,再通过该 URL 再次请求页面数据进行解析。












网友评论