想免费看小说吗?用 Java 爬虫抓取小说
很多人找不到想看的小说。今天教大家一个方法:写一个爬虫,用 HttpClient 发送请求、用 Jsoup 解析页面,把小说章节直接抓取下来。话不多说,上代码。
package com.cn.love.ui;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Downloads the chapters of a novel from 17k.com and appends their text to a local file.
 *
 * <p>Apache HttpClient sends the requests; Jsoup parses the returned HTML.
 */
public class HttpclientJsoup {

    /** Site root that chapter links from the index page are resolved against. */
    private static final String SITE_ROOT = "http://www.17k.com/";

    /** Chapter index (table of contents) page of the novel to download. */
    private static final String INDEX_URL = "http://www.17k.com/list/493239.html";

    /** Charset used to decode downloaded pages. */
    private static final String PAGE_CHARSET = "utf-8";

    /** Charset of the output text file. */
    private static final String FILE_CHARSET = "UTF-8";

    /** The only HTTP status that is treated as a successful page download. */
    private static final int HTTP_OK = 200;

    /**
     * Fetches the chapter index, follows every chapter link on it and appends each
     * chapter's heading and body text to the output file.
     *
     * <p>A chapter that yields no text, or whose text cannot be written, is reported on
     * standard error and skipped so the remaining chapters are still downloaded. A failed
     * HTTP request aborts the run.
     *
     * @param args unused
     * @throws ClientProtocolException if an HTTP protocol error occurs
     * @throws IOException if a page cannot be downloaded or returns a non-200 status
     */
    public static void main(String[] args) throws ClientProtocolException, IOException {
        HttpClient httpClient = new DefaultHttpClient();
        // Timeouts and a proxy can be set through httpClient.getParams() if required
        // (CoreConnectionPNames.CONNECTION_TIMEOUT / SO_TIMEOUT, ConnRoutePNames.DEFAULT_PROXY).

        // NOTE: the output location is hard-coded to the root of drive D:.
        File file = new File("D:" + File.separator + "修罗武神.txt");

        try {
            Document index = fetchDocument(httpClient, INDEX_URL);
            Elements chapterLinks = index.select(".Main .Volume dd a");

            for (Element link : chapterLinks) {
                String href = link.attr("href").trim();
                if (href.length() == 0) {
                    // An anchor without a target cannot be followed.
                    continue;
                }
                String url = toAbsoluteUrl(href);

                Document chapter = fetchDocument(httpClient, url);
                String text = extractChapterText(chapter);
                if (text.length() == 0) {
                    System.err.println("No chapter text found at " + url + ", skipping");
                    continue;
                }

                try {
                    appendToFile(file, text);
                } catch (IOException e) {
                    // Best effort: keep going with the next chapter, but say which one was lost.
                    System.err.println("Could not write chapter " + url + " to " + file + ": " + e);
                }
            }
        } finally {
            // Release the sockets held by the client even when a download fails.
            httpClient.getConnectionManager().shutdown();
        }
    }

    /**
     * Downloads {@code url} and parses the response body as HTML.
     *
     * @param httpClient client used to execute the GET request
     * @param url absolute URL of the page
     * @return the parsed document
     * @throws ClientProtocolException if an HTTP protocol error occurs
     * @throws IOException if the request fails or the status is not 200
     */
    private static Document fetchDocument(HttpClient httpClient, String url)
            throws ClientProtocolException, IOException {
        HttpResponse response = httpClient.execute(new HttpGet(url));
        // Read the whole body before looking at the status: fully consuming the entity
        // is what allows the client to reuse the connection for the next request.
        String html = EntityUtils.toString(response.getEntity(), PAGE_CHARSET);

        int status = response.getStatusLine().getStatusCode();
        if (status != HTTP_OK) {
            throw new IOException("Unexpected HTTP status " + status + " for " + url);
        }
        return Jsoup.parse(html);
    }

    /**
     * Turns an {@code href} taken from the index page into an absolute URL.
     *
     * <p>Absolute and protocol-relative links are kept; everything else is appended to
     * {@link #SITE_ROOT} without producing a doubled slash.
     *
     * @param href non-empty link target
     * @return absolute URL for the link
     */
    private static String toAbsoluteUrl(String href) {
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        if (href.startsWith("//")) {
            return "http:" + href;
        }
        if (href.startsWith("/")) {
            return SITE_ROOT + href.substring(1);
        }
        return SITE_ROOT + href;
    }

    /**
     * Collects the heading/body pairs of a chapter page, echoing them to standard output.
     *
     * <p>Headings are the {@code h1} elements and bodies the
     * {@code .p #chapterContentWapper} elements, paired by position. Only positions that
     * have both a heading and a body are used, so a page with unequal counts cannot cause
     * an out-of-range access.
     *
     * @param chapter parsed chapter page
     * @return heading and body lines, each terminated by CRLF; empty if no pair was found
     */
    private static String extractChapterText(Document chapter) {
        Elements titles = chapter.select("h1");
        Elements bodies = chapter.select(".p #chapterContentWapper");
        int pairs = Math.min(titles.size(), bodies.size());

        StringBuilder builder = new StringBuilder();
        for (int j = 0; j < pairs; j++) {
            String title = titles.get(j).text();
            String body = bodies.get(j).text();
            System.out.println(title);
            System.out.println(body);
            builder.append(title);
            builder.append("\r\n");
            builder.append(body);
            builder.append("\r\n");
        }
        return builder.toString();
    }

    /**
     * Appends {@code text} to {@code file}, creating the file if it does not exist.
     *
     * @param file destination file
     * @param text text to append
     * @throws IOException if the file cannot be opened or written
     */
    private static void appendToFile(File file, String text) throws IOException {
        // The 'true' flag opens the file in append mode so earlier chapters are kept.
        Writer writer = new OutputStreamWriter(new FileOutputStream(file, true), FILE_CHARSET);
        try {
            writer.write(text);
        } finally {
            // Close on every path so the file handle is not leaked when write() fails.
            writer.close();
        }
    }
}