简单的网络爬虫,下载GitHub的头像
基于一个叫WebMagic的爬虫框架(https://github.com/code4craft/webmagic)开发,可以把GitHub用户的头像爬取并保存到本地。
使用Apache的HttpClient发送HTTP请求,用Jsoup对下载下来的HTML文档做CSS选择器过滤,找到合适的图片链接,再发送请求下载图片并保存到本地。
1 package webcrawler.webcrawler; 2 import java.awt.im.InputContext; 3 import java.io.ByteArrayInputStream; 4 import java.io.File; 5 import java.io.FileOutputStream; 6 import java.io.IOException; 7 import java.io.InputStream; 8 import java.net.URL; 9 import java.util.ArrayList; 10 import java.util.Date; 11 import java.util.HashMap; 12 import java.util.HashSet; 13 import java.util.Iterator; 14 import java.util.List; 15 import java.util.Map; 16 import java.util.Set; 17 18 import org.apache.http.HttpEntity; 19 import org.apache.http.HttpResponse; 20 import org.apache.http.client.ClientProtocolException; 21 import org.apache.http.client.HttpClient; 22 import org.apache.http.client.methods.HttpGet; 23 import org.apache.http.client.utils.HttpClientUtils; 24 import org.apache.http.impl.client.DefaultHttpClient; 25 import org.apache.http.impl.client.HttpClientBuilder; 26 27 import us.codecraft.webmagic.Page; 28 import us.codecraft.webmagic.Site; 29 import us.codecraft.webmagic.Spider; 30 import us.codecraft.webmagic.pipeline.ConsolePipeline; 31 import us.codecraft.webmagic.processor.PageProcessor; 32 33 public class GithubAvaterDownLoaderProcessor implements PageProcessor { 34 35 private Site site = Site.me().setRetryTimes(3).setSleepTime(100); 36 Set<String> globalSet= new HashSet(); 37 int index=1; 38 39 public void process(Page page) {
/*在主函数里,实例化Spider时,此GithubAvaterDownLoaderProcessor被构造的实例会最终赋到Spider实例里的PageProcessor对象。
这里的process方法最后被调用时,调用时page实例已得到*/
40 List<String> listWithFollowers = new ArrayList<String>(); 41 for(String ori:page.getHtml().links().regex("(https://github\\.com/\\w+)").all()){ 42 listWithFollowers.add(ori+"/followers"); 43 listWithFollowers.add(ori+"/following"); 44 } 45 46 page.addTargetRequests(listWithFollowers); 47 48 MyPage myPage=new MyPage(page); 49 page.putField("nameLinkMap", myPage.getMap()); 50 51 52 this.downLoadAavePicToLocal(page); //发送下载请求并保存到本地 53 globalSet.addAll(myPage.getMap().keySet()); //将访问过名字存入一个set,以后可以用来检查是否某用户已经访问过,存在该set里就可以跳过不去请求了 54 } 55 56 57 public void downloadSavePicToLocal(Page page) { 58 HttpClient client= HttpClientBuilder.create().build(); 59 Map map= page.getResultItems().get("nameLinkMap"); 60 61 Iterator<String> mapItor=map.keySet().iterator(); 62 while (mapItor.hasNext()) { //迭代用户名为key,用户头像下载链接为value的map 63 String name=mapItor.next(); 64 if(globalSet.contains(name)) //检测是否访问过 65 continue; 66 67 String link =(String) map.get(name); 68 HttpGet getRequest= new HttpGet(link); 69 try { 70 HttpResponse response = client.execute(getRequest); 71 HttpEntity entity=response.getEntity(); 72 InputStream is= entity.getContent(); 73 File AvaterFolder= new File(".\\AvatersFolder"); 74 if(!AvaterFolder.exists()) AvaterFolder.mkdirs(); 75 76 File file=new File(AvaterFolder+File.separator+index++ +" "+ name + ".jpg"); 77 78 FileOutputStream fileOutputStream= new FileOutputStream(file); 79 byte[] bytes= new byte[1024]; 80 int length; 81 while((length=is.read(bytes,0,bytes.length))!=-1){ 82 fileOutputStream.write(bytes, 0, length); 83 } 84 fileOutputStream.flush(); 85 is.close(); 86 fileOutputStream.close(); 87 88 } catch (Throwable e) { 89 e.printStackTrace(); 90 } 91 } 92 } 93 94 public Site getSite() { 95 return site; 96 } 97 98 public static void main(String[] args) { 99 Spider.create(new GithubAvaterDownLoaderProcessor()) 100 .addPipeline(new MyConsolePipeline()) 101 .addUrl("https://github.com/code4craft/webmagic/followers") //种子链接 102 .thread(2) 103 .run(); 104 } 
105 }
1 package webcrawler.webcrawler; 2 3 4 5 import java.util.ArrayList; 6 import java.util.HashMap; 7 import java.util.List; 8 import java.util.Map; 9 import java.util.regex.Matcher; 10 import java.util.regex.Pattern; 11 12 import javax.management.remote.SubjectDelegationPermission; 13 14 import org.jsoup.Jsoup; 15 import org.jsoup.nodes.Document; 16 import org.jsoup.nodes.Element; 17 18 import us.codecraft.webmagic.Page; 19 import us.codecraft.webmagic.selector.Html; 20 import us.codecraft.webmagic.utils.UrlUtils; 21 22 public class MyPage{ 23 private String rString; 24 private Map<String,String> map; 25 26 public Map getMap(){ 27 return this.map; //用来返回此map,用户名为key,用户头像链接为value 28 } 29 30 public MyPage(Page page) { 31 rString= page.getRawText(); 32 map = new HashMap(); 33 Document document=Jsoup.parse(rString); //用JSoup去parse页面 34 List<Element> listOfElements = document.select("img.gravatar"); //CSS selector 去定位元素 35 for(Element element:listOfElements){ 36 map.put((String)element.attr("alt"), getCleanOne((String)element.attr("src"))); //保存到map, 用户名(被定位元素的alt属性值)为key,该用户头像链接为value 37 } 38 } 39 public String getCleanOne(String s) { 40 Pattern pattern = Pattern.compile("https://avatars\\d\\.githubusercontent\\.com/u/\\d+"); //只寻找符合此条件的链接 41 Matcher matcher=pattern.matcher(s); 42 if(matcher.find()) 43 return matcher.group(); 44 return null; 45 } 46 47 }