手把手教做小偷采集
小偷采集,有经验的猿猿应该都会做,我借此做一个回忆。
2013年我就职盛大文学,当时因文学版权、流量等问题,做了一套监控系统,用来监控当时的创世、龙空、纵横等等比较知名中文网站。对于监控,我还可以自吹一下经验满满。
当下社会,正是监控系统兴风作浪的大好时机,而且有利可图。
举例说明一下:
1、公共wifi,收集用户的部分信息,地理坐标,连接时长,搜索内容等等,通过数据分析给用户贴上标签[吃货]、[美娇娘]、[剁手党] 等等,然后将信息打包贩卖,基本能买到好几块一条呐。
2、通过采集获得大量的优秀文章,将其精修后可作为一些书籍的底稿来用,而且这些底稿可以换钱滴。
3、搜索大网站的用户信息,比如就搜索cnblogs的推荐博客博主,把他们的信息down下来,整理分析,贴上[淫才]标签,打包贩卖,至少得30块一条吧。
4、如果能有幸通过一些高级酒店的网站,搜索到一些零碎的用户信息,并将其整理、拼接、合成比较完整的用户信息,这些皇冠级用户信息拿到4S、售楼处贩卖,怎么着一条信息也得小一百吧。
…
不用再写下去了,监控能创造出巨额利益。
开始教做最简单的小偷。当然这篇小偷偷的就是cnblogs,学会了可不要乱搞,搞坏了谁负责?
使用 Java 编写,需要引入绿色框中的包 mysql-connector-java-5.1.13.jar,用来连接 MySQL 数据库;如果采集的信息不入库,则可以不用下载此包。
jdk1.8.0_112
黄色框内是方法:
downhtml 下载HTML内容
downImages 下载图片
GetDocument 下载博客日志
GetList 下载精华区列表
InsertMysql 博客入库
main 程序入口
蓝色框内是正则表达式和一些配置信息:
contectPattern 正则匹配出内容
imgPattern 正则匹配出图片
listPattern 正则匹配出精华区列表
localPath 下载的图片本地存放路径
pickFormat 精华区url Formatter
webSite 你的网站
主要结构
蓝色框总剖析
// 精选列表Formatter private static String pickFormat = "https://www.cnblogs.com/pick/%s/"; //获取图片url的正则表达式 private static Pattern imgPattern = Pattern.compile("\"(?<head>http(s|))://(?<url>[^\"]+).(?<tp>PNG|png|JPG|jpg|GIF|gif)\""); //获取精华列表的正则表达式 private static Pattern listPattern = Pattern.compile("<div class=\"post_item_body\">\\s*<h3><a class=\"titlelnk\" href=\"(?<url>[^\"]+)\" target=\"_blank\">(?<title>[^<]+)</a></h3>\\s*<p class=\"post_item_summary\">\\s*(<a[\\s\\S]+?alt=\"\"/></a>){0,1}(?<connect>[^<]+)</p>\\s*<div class=\"post_item_foot\">\\s*<a[\\s\\S]+?<span class=\"article_comment\"><a[\\s\\S]+?class=\"gray\">(?<ping>[^<]+)</a></span><span class=\"article_view\"><a[\\s\\S]+?class=\"gray\">(?<yue>[^<]+)</a></span></div>\\s*</div>"); //获取博客内容的正则表达式 private static Pattern contectPattern = Pattern.compile("<div id=\"zmdao_post_body\">(?<conect>[\\s\\S]+?)</div><div id=\"MySignature\"></div>"); //网站放置下载图片的路径 private static String webSite = "http://www.Website.com/loadimages/"; //本地下载图片的路径 private static String localPath = "D:\\WWW\\loadimages\\";
以上内容不解释。
DownHtml剖析
/**
 * Downloads the document at the given URL and returns it as text.
 *
 * The response is decoded as UTF-8 and its lines are joined with "\r\n".
 * Leading empty lines are not preserved (same as the original implementation).
 *
 * @param url address of the page to download
 * @return the page content
 * @throws IOException if the connection cannot be opened or reading fails
 */
static String downhtml(String url) throws UnsupportedEncodingException, IOException {
    URLConnection conn = new URL(url).openConnection();
    // Without timeouts a stalled server would block the crawl forever.
    conn.setConnectTimeout(10000);
    conn.setReadTimeout(30000);

    StringBuilder page = new StringBuilder();
    // try-with-resources closes the reader (and the underlying stream) even if readLine throws.
    try (BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "utf-8"))) {
        String read;
        while ((read = br.readLine()) != null) {
            if (page.length() > 0) {
                page.append("\r\n");
            }
            page.append(read);
        }
    }
    return page.toString();
}
DownImages剖析
/**
 * Downloads a remote image to a local file.
 *
 * Failures are reported on stderr and otherwise ignored, so a single broken
 * image does not abort the caller.
 *
 * @param imgUrl  URL of the image
 * @param imgName full path of the local file to write
 */
static void downImages(String imgUrl, String imgName) {
    System.out.println("downfile --> " + imgName + "\t" + imgUrl);
    try {
        URLConnection conn = new URL(imgUrl).openConnection();
        conn.setConnectTimeout(10000);
        // Also bound the read, otherwise a stalled transfer hangs forever.
        conn.setReadTimeout(30000);
        // Both streams are closed automatically, including on failure.
        try (InputStream inStream = conn.getInputStream();
             FileOutputStream fs = new FileOutputStream(imgName)) {
            byte[] buffer = new byte[8192];
            int byteread;
            while ((byteread = inStream.read(buffer)) != -1) {
                fs.write(buffer, 0, byteread);
            }
        }
    } catch (IOException e) {
        System.err.println("Failed to download image " + imgUrl);
        // println(e.getStackTrace()) would only print the array's identity, not the trace.
        e.printStackTrace();
    }
}
GetList剖析
/** * 下载精华区列表 */ static void GetList() { for (int i = 1; i < 80; i++) { try { Thread.sleep(7777); String url = String.format(pickFormat, i); //得到第一条是https://www.cnblogs.com/pick/1/ 精华区列表第一页 String html = downhtml(url);//下载精华区 Matcher listMatcher = listPattern.matcher(html);//匹配精华区列表 while (listMatcher.find()) { String title = listMatcher.group("title");//博客标题 url = listMatcher.group("url");//博客URL System.out.println(title + "\t" + url); GetDocument(url, title);//下载博客内容 Thread.sleep(7777); } } catch (Exception e) { System.err.println(e.getStackTrace()); } } }
GetDocument剖析
/** * 下载博客日志 */ static void GetDocument(String url, String title) { try { String html = downhtml(url);//下载博客日志 System.out.println("html.length --> " + html.length()); Matcher contectMatcher = contectPattern.matcher(html);//匹配博客内容 if (contectMatcher.find()) { String content = contectMatcher.group("conect");//获得博客内容 //图片的url去重复 HashMap<String, String> map = new HashMap<String, String>(); Matcher imgMatcher = imgPattern.matcher(content);//匹配博客图片url while (imgMatcher.find()) { String matVal = imgMatcher.group(); String webUrl = matVal.substring(1, matVal.length() - 1); if (map.containsKey(webUrl)) continue; String fileName = UUID.randomUUID().toString().replace("-", "") + "." + imgMatcher.group("tp");//保存本地随机生成图片名 String fullName = localPath + fileName;//保存图片的全路径 String webFileName = webSite + fileName;//博客内容需要转换的新图片url downImages(webUrl, fullName);//下载图片 map.put(webUrl, webFileName); try { Thread.sleep(333); } catch (InterruptedException e) { System.err.println(e.getStackTrace()); } } //将cnblogs的图片url替换成预设网站的url for (Entry<String, String> entry : map.entrySet()) { content = content.replace(entry.getKey(), entry.getValue()); } System.out.println("Match content --> " + content.substring(0, 50)); InsertMysql(title, content);//入库 } } catch (IOException e) { System.err.println(e.getStackTrace()); } }
InsertMysql剖析
//博客入库 static void InsertMysql(String title, String content) { String url = "jdbc:mysql://localhost:3306/wordpress"; String user = "root"; String password = "root"; Connection connection = null; PreparedStatement preparedStatement = null; try { Class.forName("com.mysql.jdbc.Driver"); connection = DriverManager.getConnection(url, user, password); preparedStatement = connection .prepareStatement("INSERT INTO wordpress.wp_posts " + "(post_author, " + "post_date, " + "post_date_gmt, " + "post_content, " + "post_title, " + "post_excerpt, " + "post_status, " + "comment_status, " + "ping_status, " + "post_password, " + "post_name, " + "to_ping, " + "pinged, " + "post_modified, " + "post_modified_gmt, " + "post_content_filtered, " + "post_parent, " + "guid, " + "menu_order, " + "post_type, " + "post_mime_type, " + "comment_count" + ")" + "VALUES" + "(1, " + "now(), " + "now(), " + "?, " + "?, " + "'', " + "'publish', " + "'open', " + "'open', " + "'', " + "'', " + "'', " + "'', " + "now(), " + "now(), " + "'', " + "0, " + "'', " + "0, " + "'post', " + "'', " + "0" + ");"); preparedStatement.setString(1, content); preparedStatement.setString(2, title); System.out.println(preparedStatement.executeUpdate() + " " + title); } catch (Exception e) { System.err.println(e.getStackTrace()); } finally { if (preparedStatement != null) try { preparedStatement.close(); } catch (SQLException e) { System.err.println(e.getStackTrace()); } if (connection != null) try { connection.close(); } catch (SQLException e) { System.err.println(e.getStackTrace()); } } }
完整代码(本人喜欢随性乱涂,代码基本无注释,抱歉)
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;

/**
 * Crawls the list pages built from pickFormat, downloads every post found there,
 * mirrors the post's images to a local directory and stores the post in a
 * WordPress wp_posts table.
 */
public class appCnblogsCollect {
    // Format string for the "pick" list pages; %s is replaced with the page number.
    private static final String pickFormat = "https://www.cnblogs.com/pick/%s/";

    // Matches a double-quoted absolute http/https image URL and captures its extension in "tp".
    // The dot before the extension is escaped so that only a literal '.' separates the
    // path from the extension (an unescaped '.' would accept any character there).
    private static final Pattern imgPattern = Pattern.compile("\"(?<head>http(s|))://(?<url>[^\"]+)\\.(?<tp>PNG|png|JPG|jpg|GIF|gif)\"");

    // Matches one entry of the list page; captures "url", "title", "connect" (summary text),
    // "ping" and "yue" (the two counters shown in the entry footer).
    private static final Pattern listPattern = Pattern.compile("<div class=\"post_item_body\">\\s*<h3><a class=\"titlelnk\" href=\"(?<url>[^\"]+)\" target=\"_blank\">(?<title>[^<]+)</a></h3>\\s*<p class=\"post_item_summary\">\\s*(<a[\\s\\S]+?alt=\"\"/></a>){0,1}(?<connect>[^<]+)</p>\\s*<div class=\"post_item_foot\">\\s*<a[\\s\\S]+?<span class=\"article_comment\"><a[\\s\\S]+?class=\"gray\">(?<ping>[^<]+)</a></span><span class=\"article_view\"><a[\\s\\S]+?class=\"gray\">(?<yue>[^<]+)</a></span></div>\\s*</div>");

    // Captures the post body (group "conect") between the body container and the signature div.
    private static final Pattern contectPattern = Pattern.compile("<div id=\"zmdao_post_body\">(?<conect>[\\s\\S]+?)</div><div id=\"MySignature\"></div>");

    // Public URL prefix under which the downloaded images will be served.
    private static final String webSite = "http://www.Website.com/loadimages/";

    // Local directory the downloaded images are written to.
    private static final String localPath = "D:\\WWW\\loadimages\\";

    public static void main(String[] args) throws Exception {
        GetList();
    }

    /**
     * Walks list pages 1..79, and for every entry matched by listPattern
     * downloads the linked post via GetDocument.
     *
     * The crawl pauses 7777 ms before each list page and after each post.
     * A failure on one page is reported and the loop continues with the next page;
     * an interrupt stops the crawl.
     */
    static void GetList() {
        for (int i = 1; i < 80; i++) {
            try {
                Thread.sleep(7777);
                // Page 1 resolves to https://www.cnblogs.com/pick/1/
                String listUrl = String.format(pickFormat, i);
                String html = downhtml(listUrl);
                Matcher listMatcher = listPattern.matcher(html);
                while (listMatcher.find()) {
                    String title = listMatcher.group("title");
                    String postUrl = listMatcher.group("url");
                    System.out.println(title + "\t" + postUrl);
                    GetDocument(postUrl, title);
                    Thread.sleep(7777);
                }
            } catch (InterruptedException e) {
                // Restore the flag and stop instead of silently carrying on.
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                System.err.println("Failed to process list page " + i);
                e.printStackTrace();
            }
        }
    }

    /**
     * Downloads one post, mirrors its images locally, rewrites the image URLs
     * in the post body and stores the result via InsertMysql.
     *
     * Nothing is stored when contectPattern does not match the page.
     *
     * @param url   address of the post
     * @param title title of the post, stored alongside the content
     */
    static void GetDocument(String url, String title) {
        try {
            String html = downhtml(url);
            System.out.println("html.length --> " + html.length());
            Matcher contectMatcher = contectPattern.matcher(html);
            if (!contectMatcher.find()) {
                System.err.println("No post body matched in " + url);
                return;
            }
            String content = contectMatcher.group("conect");

            // original image URL -> replacement URL; also de-duplicates repeated images
            HashMap<String, String> map = new HashMap<String, String>();
            Matcher imgMatcher = imgPattern.matcher(content);
            while (imgMatcher.find()) {
                String matVal = imgMatcher.group();
                // strip the surrounding double quotes
                String webUrl = matVal.substring(1, matVal.length() - 1);
                if (map.containsKey(webUrl)) {
                    continue;
                }
                // random local file name, keeping the original extension
                String fileName = UUID.randomUUID().toString().replace("-", "") + "." + imgMatcher.group("tp");
                String fullName = localPath + fileName;
                String webFileName = webSite + fileName;
                downImages(webUrl, fullName);
                map.put(webUrl, webFileName);
                try {
                    Thread.sleep(333);
                } catch (InterruptedException e) {
                    // Restore the flag and abandon this post so the caller can stop too.
                    Thread.currentThread().interrupt();
                    return;
                }
            }

            // point every image reference at the mirrored copy
            for (Entry<String, String> entry : map.entrySet()) {
                content = content.replace(entry.getKey(), entry.getValue());
            }
            // Preview at most 50 characters; a plain substring(0, 50) throws on shorter posts.
            System.out.println("Match content --> " + content.substring(0, Math.min(50, content.length())));
            InsertMysql(title, content);
        } catch (IOException e) {
            System.err.println("Failed to download post " + url);
            e.printStackTrace();
        }
    }

    /**
     * Downloads the document at the given URL and returns it as text.
     *
     * The response is decoded as UTF-8 and its lines are joined with "\r\n".
     * Leading empty lines are not preserved (same as the original implementation).
     *
     * @param url address of the page to download
     * @return the page content
     * @throws IOException if the connection cannot be opened or reading fails
     */
    static String downhtml(String url) throws UnsupportedEncodingException, IOException {
        URLConnection conn = new URL(url).openConnection();
        // Without timeouts a stalled server would block the crawl forever.
        conn.setConnectTimeout(10000);
        conn.setReadTimeout(30000);

        StringBuilder page = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream) even if readLine throws.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "utf-8"))) {
            String read;
            while ((read = br.readLine()) != null) {
                if (page.length() > 0) {
                    page.append("\r\n");
                }
                page.append(read);
            }
        }
        return page.toString();
    }

    /**
     * Downloads a remote image to a local file.
     *
     * Failures are reported on stderr and otherwise ignored, so a single broken
     * image does not abort the caller.
     *
     * @param imgUrl  URL of the image
     * @param imgName full path of the local file to write
     */
    static void downImages(String imgUrl, String imgName) {
        System.out.println("downfile --> " + imgName + "\t" + imgUrl);
        try {
            URLConnection conn = new URL(imgUrl).openConnection();
            conn.setConnectTimeout(10000);
            // Also bound the read, otherwise a stalled transfer hangs forever.
            conn.setReadTimeout(30000);
            // Both streams are closed automatically, including on failure.
            try (InputStream inStream = conn.getInputStream();
                 FileOutputStream fs = new FileOutputStream(imgName)) {
                byte[] buffer = new byte[8192];
                int byteread;
                while ((byteread = inStream.read(buffer)) != -1) {
                    fs.write(buffer, 0, byteread);
                }
            }
        } catch (IOException e) {
            System.err.println("Failed to download image " + imgUrl);
            // println(e.getStackTrace()) would only print the array's identity, not the trace.
            e.printStackTrace();
        }
    }

    /**
     * Inserts one post into the wordpress.wp_posts table as a published post.
     *
     * Title and content are bound as statement parameters; all other columns get
     * fixed values. Failures are reported on stderr and otherwise ignored.
     *
     * @param title   post title
     * @param content post body (HTML)
     */
    static void InsertMysql(String title, String content) {
        String url = "jdbc:mysql://localhost:3306/wordpress";
        // NOTE(review): credentials are hard-coded; move them to configuration before real use.
        String user = "root";
        String password = "root";
        String sql = "INSERT INTO wordpress.wp_posts "
                + "(post_author, " + "post_date, " + "post_date_gmt, " + "post_content, " + "post_title, "
                + "post_excerpt, " + "post_status, " + "comment_status, " + "ping_status, " + "post_password, "
                + "post_name, " + "to_ping, " + "pinged, " + "post_modified, " + "post_modified_gmt, "
                + "post_content_filtered, " + "post_parent, " + "guid, " + "menu_order, " + "post_type, "
                + "post_mime_type, " + "comment_count" + ")"
                + "VALUES"
                + "(1, " + "now(), " + "now(), " + "?, " + "?, "
                + "'', " + "'publish', " + "'open', " + "'open', " + "'', "
                + "'', " + "'', " + "'', " + "now(), " + "now(), "
                + "'', " + "0, " + "'', " + "0, " + "'post', "
                + "'', " + "0" + ");";
        try {
            Class.forName("com.mysql.jdbc.Driver");
            // Statement and connection are closed automatically, in that order.
            try (Connection connection = DriverManager.getConnection(url, user, password);
                 PreparedStatement preparedStatement = connection.prepareStatement(sql)) {
                preparedStatement.setString(1, content);
                preparedStatement.setString(2, title);
                System.out.println(preparedStatement.executeUpdate() + " " + title);
            }
        } catch (ClassNotFoundException | SQLException e) {
            System.err.println("Failed to store post: " + title);
            e.printStackTrace();
        }
    }
}
完整代码
以上是一个不需要用户身份验证的例子,在某些特殊的情况下,需要身份验证怎么办?不要着急,我下面还有一个例子,下面的例子是年末统计全年工作日报,而且这个工作日报还影响绩效和考核,可是,我一年没填了,从头写是不可能了,只能做个小工具,于是就有了它,随便乱弹的,看着丑就丑吧。
package ebooks;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.List;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;

/**
 * Posts a login form to a timesheet service and then submits one 8-hour entry
 * for every weekday in a fixed date range, reusing the same HTTP client.
 */
public class appTimeSheet {
    public static void main(String[] args) throws Exception {
        String username = "username";
        String password = "yourpassword";
        // "HH" is the 24-hour field. The previous "hh" (12-hour, no AM/PM marker)
        // rendered the midnight timestamps produced below as 12:00:00.
        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        String urlLogin = "http://10.54.11.37:8989/login";
        String urlSave = "http://10.54.11.37:8989/agenda/submission/saveTimereg.json";

        DefaultHttpClient client = new DefaultHttpClient(new PoolingClientConnectionManager());
        try {
            HttpPost loginHttpPost = new HttpPost(urlLogin);
            List<NameValuePair> loginPairs = new ArrayList<NameValuePair>();
            loginPairs.add(new BasicNameValuePair("username", username));
            loginPairs.add(new BasicNameValuePair("password", password));
            loginHttpPost.setEntity(new UrlEncodedFormEntity(loginPairs, "utf-8"));
            try {
                HttpResponse response = client.execute(loginHttpPost);
                System.out.println(response.getStatusLine());
            } finally {
                // Return the connection to the pool; unreleased connections
                // would exhaust the pool after a few requests.
                loginHttpPost.releaseConnection();
            }

            // Calendar months are zero-based: 11 is December.
            Calendar calendar = Calendar.getInstance();
            calendar.clear();
            calendar.set(2017, 11, 24);
            long end = calendar.getTimeInMillis(); // exclusive upper bound
            calendar.clear();
            calendar.set(2017, 11, 4);

            for (; calendar.getTimeInMillis() < end; calendar.add(Calendar.DATE, 1)) {
                int dayOfWeek = calendar.get(Calendar.DAY_OF_WEEK);
                if (dayOfWeek == Calendar.SUNDAY || dayOfWeek == Calendar.SATURDAY) {
                    continue;
                }

                HttpPost savePost = new HttpPost(urlSave);
                List<NameValuePair> savePairs = new ArrayList<NameValuePair>();
                // The form repeats the "timereg" field; the order of the values is significant.
                savePairs.add(new BasicNameValuePair("timereg", "监控室"));
                savePairs.add(new BasicNameValuePair("timereg", "研发项目"));
                savePairs.add(new BasicNameValuePair("timereg", "云平台智能服务技术的研究和应用"));
                savePairs.add(new BasicNameValuePair("timereg", "全部(共享类)"));
                // Seven comma-separated slots with 8 in the slot of the current weekday.
                switch (dayOfWeek) {
                case Calendar.MONDAY:
                    savePairs.add(new BasicNameValuePair("timereg", "8,0,0,0,0,0,0"));
                    System.out.println("星期一");
                    break;
                case Calendar.TUESDAY:
                    savePairs.add(new BasicNameValuePair("timereg", "0,8,0,0,0,0,0"));
                    System.out.println("星期二");
                    break;
                case Calendar.WEDNESDAY:
                    savePairs.add(new BasicNameValuePair("timereg", "0,0,8,0,0,0,0"));
                    System.out.println("星期三");
                    break;
                case Calendar.THURSDAY:
                    savePairs.add(new BasicNameValuePair("timereg", "0,0,0,8,0,0,0"));
                    System.out.println("星期四");
                    break;
                case Calendar.FRIDAY:
                    savePairs.add(new BasicNameValuePair("timereg", "0,0,0,0,8,0,0"));
                    System.out.println("星期五");
                    break;
                }
                savePairs.add(new BasicNameValuePair("timereg", "监控系统开发与应用"));
                savePairs.add(new BasicNameValuePair("timereg", "8"));

                // NOTE(review): the +1 offset is kept as-is; presumably it matches the
                // server's week numbering - confirm against the service.
                int week = calendar.get(Calendar.WEEK_OF_YEAR) + 1;
                savePairs.add(new BasicNameValuePair("theweek", Integer.toString(week)));
                System.out.println("第" + week + "周");

                String startTime = simpleDateFormat.format(calendar.getTime());
                savePairs.add(new BasicNameValuePair("starttime", startTime));
                System.out.println("starttime=" + startTime);

                savePost.setEntity(new UrlEncodedFormEntity(savePairs, "utf-8"));
                try {
                    HttpResponse saveResponse = client.execute(savePost);
                    System.out.println(saveResponse.getStatusLine());
                } finally {
                    savePost.releaseConnection();
                }
                System.out.println();
                Thread.sleep(7777);
            }
        } finally {
            // Close all pooled connections when done.
            client.getConnectionManager().shutdown();
        }
    }
}
送一个需要帐号密码登录的例子