package spider;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
//http://webmagic.io/docs/zh/posts/ch4-basic-page-processor/pageprocessor.html
/**
 * WebMagic page processor that visits a Zhihu answer page, reads the question
 * title and the image URLs found in the answer body, and saves those images
 * into a local folder named after the question.
 */
public class MySpider implements PageProcessor {

    /** Regular expression for Zhihu answer page URLs; only matching pages are processed. */
    private static final String URL_answer = "https://www\\.zhihu\\.com/question/\\d+/answer/\\d+";

    /** Root folder under which one sub-folder per question title is created. */
    private static final String DOWNLOAD_ROOT = "D:\\知乎图片\\";

    /** Folder name used when the page yields no usable question title. */
    private static final String FALLBACK_TITLE = "untitled";

    // Crawl configuration: retry count, sleep time, timeout, cookies and user agent.
    // Per the WebMagic Site API the sleep time and timeout are given in milliseconds.
    // NOTE(review): cookie values are hard-coded in source, and the "z_c0" value is a
    // placeholder ("your cookie") that must be replaced before running; consider
    // loading them from configuration instead of committing them.
    private final Site site = Site.me()
            .setRetryTimes(10)
            .setSleepTime(5000)
            .setTimeOut(5000)
            .addCookie("www.zhihu.com", "unlock_ticket", "QUJBTXRpWGJRd2dYQUFBQVlRSlZUZl83Q2xjZkJISHZkZm13R05Jck93eTNFU2IyUE53LWVnPT0=|"
                    + "1460335857|e1d68d4125f73b6280312c3eafa71da1b9fc7cab")
            .addCookie("login", "MWRiZWUxNmMzOTA5NDdmNTkwNGRmNWQyZWZhNDRmY2U=|1475371295|b9e9c165fc1d3c314afa2b66e3ff27c514bb4946")
            .addCookie("Domain", "www.zhihu.com")
            .addCookie("z_c0", "你的cookie")
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");

    /**
     * Entry point: starts a single-threaded spider on one hard-coded answer URL.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String answerUrl = "https://www.zhihu.com/question/36435092/answer/99247306";
        Spider.create(new MySpider()).addUrl(answerUrl).thread(1).run();
    }

    /**
     * Returns the crawl configuration used for every request.
     *
     * @return the site settings held by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Extraction logic for one downloaded page. Pages whose URL does not match
     * {@link #URL_answer} are ignored. For answer pages the question title and
     * image URLs are printed and the selected images are downloaded.
     *
     * @param page the page handed over by WebMagic
     */
    @Override
    public void process(Page page) {
        if (!page.getUrl().regex(URL_answer).match()) {
            return;
        }

        List<String> urlList = page.getHtml().xpath("//div[@class=RichContent-inner]//img/@data-original").all();
        String questionTitle = page.getHtml().xpath("//h1[@class=QuestionHeader-title]/text()").toString();
        System.out.println("题目:" + questionTitle);
        System.out.println(urlList);
        System.out.println(urlList.size());

        // Keep every second entry (indices 0, 2, 4, ...).
        // NOTE(review): presumably each image is matched twice by the XPath above,
        // so this drops the duplicates -- confirm against the live page markup.
        List<String> url = new ArrayList<String>();
        for (int i = 0; i < urlList.size(); i += 2) {
            url.add(urlList.get(i));
        }

        try {
            if (!downLoadPics(url, questionTitle, DOWNLOAD_ROOT)) {
                System.err.println("Some images could not be downloaded for: " + questionTitle);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads every image URL into the folder {@code filePath + title}.
     *
     * <p>The title is made safe for use as a folder name first (see
     * {@link #sanitizeFolderName(String)}). Each file is named
     * {@code <random><"pic"><index>.jpg}; the random prefix reduces the chance of
     * clashing with files left by an earlier run. A failure on one image is
     * reported on standard error and does not stop the remaining downloads.
     *
     * @param imgUrls  absolute URLs of the images to fetch
     * @param title    question title used as the folder name; may be {@code null}
     * @param filePath root folder path that the title is appended to
     * @return {@code true} if every image was saved, {@code false} if at least one failed
     * @throws Exception if the target folder cannot be created
     */
    public static boolean downLoadPics(List<String> imgUrls, String title, String filePath) throws Exception {
        File fileDir = new File(filePath + sanitizeFolderName(title));
        // mkdirs() also returns false when the folder already exists, so check the end state.
        if (!fileDir.mkdirs() && !fileDir.isDirectory()) {
            throw new IOException("Cannot create download directory: " + fileDir);
        }

        boolean isSuccess = true;
        int i = 1;
        for (String imgUrl : imgUrls) {
            int x = (int) (Math.random() * 1000000);
            File target = new File(fileDir, x + "pic" + i + ".jpg");
            System.out.println("正在下载......第 " + i + "张图片......请稍后");
            try {
                saveUrlToFile(imgUrl, target);
                System.out.println("第 " + i + "张图片下载完毕......");
            } catch (IOException e) {
                isSuccess = false;
                System.err.println("Failed to download image " + i + " from " + imgUrl + ": " + e);
                // Do not leave a truncated file behind.
                if (target.exists() && !target.delete()) {
                    System.err.println("Could not remove partial file: " + target);
                }
            }
            i++;
        }
        return isSuccess;
    }

    /**
     * Copies the content behind {@code imgUrl} into {@code target}. Both streams
     * are closed even when reading or writing fails.
     */
    private static void saveUrlToFile(String imgUrl, File target) throws IOException {
        URL url = new URL(imgUrl);
        try (InputStream in = url.openStream();
             OutputStream out = new FileOutputStream(target)) {
            byte[] buffer = new byte[8192];
            int length;
            // read() signals end of stream with -1.
            while ((length = in.read(buffer)) != -1) {
                out.write(buffer, 0, length);
            }
        }
    }

    /**
     * Turns a page title into a usable folder name: characters that Windows
     * reserves in file names and control characters become {@code '_'},
     * surrounding whitespace and trailing dots are removed, and a missing or
     * empty title falls back to {@link #FALLBACK_TITLE}.
     */
    private static String sanitizeFolderName(String title) {
        if (title == null) {
            return FALLBACK_TITLE;
        }
        String cleaned = title.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_").trim();
        // Windows silently strips trailing dots and spaces from names.
        cleaned = cleaned.replaceAll("[. ]+$", "");
        return cleaned.isEmpty() ? FALLBACK_TITLE : cleaned;
    }
}