使用java怎么爬取代理IP
作者
本篇文章为大家展示了如何使用 Java 爬取代理 IP,内容简明扼要并且容易理解,绝对能使你眼前一亮,希望通过这篇文章的详细介绍你能有所收获。首先,在项目的 pom.xml 中引入 fastjson 和 jsoup 这两个依赖:
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.28</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>
完整的代码如下:
package com.tuniu.fcm.facade.IPProxy;

import com.alibaba.fastjson.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Collects working HTTP proxies by scraping free proxy-list pages.
 *
 * <p>Each listing page is fetched with Jsoup, "ip port" pairs are extracted from the page text,
 * every candidate is verified by issuing a request through it, and the verified proxies are
 * returned as a JSON string.
 *
 * <p>Requires {@code com.alibaba.fastjson.JSONObject} and Jsoup on the classpath.
 */
public class ProxyCralwerUnusedVPN {

    /**
     * Matches an IPv4-looking address followed by whitespace and a port number.
     * Group 1 is the address, group 2 is the port.
     */
    private static final Pattern PROXY_PATTERN =
            Pattern.compile("(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\\s+(\\d{1,6})");

    /** Number of proxies requested by the caller of the crawl running on the current thread. */
    ThreadLocal<Integer> localWantedNumber = new ThreadLocal<Integer>();

    /** Proxies verified so far by the crawl running on the current thread. */
    ThreadLocal<List<ProxyInfo>> localProxyInfos = new ThreadLocal<List<ProxyInfo>>();

    public static void main(String[] args) {
        ProxyCralwerUnusedVPN proxyCrawler = new ProxyCralwerUnusedVPN();
        // The number of proxies to collect is chosen by the caller; a large number makes
        // the call slow because every candidate is verified over the network.
        proxyCrawler.startCrawler(1);
    }

    /**
     * Entry point for other modules.
     *
     * @param wantedNumber number of working proxies the caller would like to obtain; crawling
     *                     stops as soon as this many have been verified
     * @return a JSON string with {@code success} and a {@code data} object holding
     *         {@code numFound}, {@code pageNum} and the {@code proxy} list
     */
    public String startCrawler(int wantedNumber) {
        localWantedNumber.set(wantedNumber);
        // Start from an empty list so proxies found by an earlier call on this thread
        // are not counted or returned again.
        localProxyInfos.set(new ArrayList<ProxyInfo>());
        try {
            kuaidailiCom("http://www.xicidaili.com/nn/", 15);
            kuaidailiCom("http://www.xicidaili.com/nt/", 15);
            kuaidailiCom("http://www.xicidaili.com/wt/", 15);
            kuaidailiCom("http://www.kuaidaili.com/free/inha/", 15);
            kuaidailiCom("http://www.kuaidaili.com/free/intr/", 15);
            kuaidailiCom("http://www.kuaidaili.com/free/outtr/", 15);

            // Build the response payload.
            List<ProxyInfo> found = currentProxies();
            ProxyResponse response = new ProxyResponse();
            response.setSuccess("true");
            Map<String, Object> dataInfoMap = new HashMap<String, Object>();
            dataInfoMap.put("numFound", found.size());
            dataInfoMap.put("pageNum", 1);
            dataInfoMap.put("proxy", found);
            response.setData(dataInfoMap);

            String responseString = JSONObject.toJSON(response).toString();
            System.out.println(responseString);
            return responseString;
        } finally {
            // Release per-thread state so pooled threads do not retain it between calls.
            localWantedNumber.remove();
            localProxyInfos.remove();
        }
    }

    /**
     * Crawls the numbered listing pages under {@code baseUrl} (page 1 through
     * {@code totalPage}, inclusive) and records every proxy that passes {@link #checkProxy}.
     * Returns early once the wanted number of proxies has been reached. A failure on one
     * page is reported and the crawl continues with the next page.
     *
     * @param baseUrl   listing URL prefix ending in {@code /}; the page number is appended
     * @param totalPage number of pages to visit at most
     */
    private void kuaidailiCom(String baseUrl, int totalPage) {
        for (int page = 1; page <= totalPage; page++) {
            if (getCurrentProxyNumber() >= localWantedNumber.get()) {
                return;
            }
            try {
                // No explicit Host header: the HTTP client derives it from the URL, which
                // keeps it correct for every site this method is used with.
                Document doc = Jsoup.connect(baseUrl + page + "/")
                        .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                        .header("Accept-Encoding", "gzip,deflate,sdch")
                        .header("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6")
                        .header("Cache-Control", "max-age=0")
                        .header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36")
                        .header("Cookie", "Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244;_gat=1;_ga=GA1.2.1061361785.1462812244")
                        .header("Referer", "http://www.kuaidaili.com/free/outha/")
                        .timeout(30 * 1000)
                        .get();

                Matcher m = PROXY_PATTERN.matcher(doc.text());
                while (m.find()) {
                    if (getCurrentProxyNumber() >= localWantedNumber.get()) {
                        break;
                    }
                    String ip = m.group(1);
                    String port = m.group(2);
                    if (checkProxy(ip, Integer.parseInt(port))) {
                        System.out.println("获取到可用代理IP\t" + ip + "\t" + port);
                        addProxy(ip, port, "http");
                    }
                }
            } catch (Exception e) {
                // Best effort: report the failure and move on to the next page.
                e.printStackTrace();
            }
        }
    }

    /**
     * Checks whether a proxy is usable by fetching a page through it with a 2-second timeout.
     *
     * @param ip   proxy host
     * @param port proxy port
     * @return {@code true} if the request through the proxy completed, {@code false} on an
     *         out-of-range port or any failure
     */
    private static boolean checkProxy(String ip, Integer port) {
        if (port == null || port < 1 || port > 65535) {
            return false;
        }
        try {
            // http://1212.ip138.com/ic.asp can be replaced by any page that responds quickly.
            Jsoup.connect("http://1212.ip138.com/ic.asp")
                    .timeout(2 * 1000)
                    .proxy(ip, port)
                    .get();
            return true;
        } catch (Exception e) {
            return false;
        }
    }

    /** Returns the number of proxies collected so far on the current thread. */
    private int getCurrentProxyNumber() {
        return currentProxies().size();
    }

    /** Appends a verified proxy to the current thread's result list. */
    private void addProxy(String ip, String port, String protocol) {
        currentProxies().add(new ProxyInfo(ip, port, protocol));
    }

    /**
     * Returns the current thread's result list, creating and registering it on first use so
     * that additions are never made to a list that is not stored in the thread-local.
     */
    private List<ProxyInfo> currentProxies() {
        List<ProxyInfo> proxyInfos = localProxyInfos.get();
        if (proxyInfos == null) {
            proxyInfos = new ArrayList<ProxyInfo>();
            localProxyInfos.set(proxyInfos);
        }
        return proxyInfos;
    }
}

/**
 * One proxy entry in the JSON response. It is serialized through its getters, so the
 * getter names determine the JSON field names.
 */
class ProxyInfo {
    private String userName = "";
    private String ip;
    private String password = "";
    private String type;
    private String port;
    private int is_internet = 1;

    public ProxyInfo(String ip, String port, String type) {
        this.ip = ip;
        this.type = type;
        this.port = port;
    }

    public String getUserName() {
        return userName;
    }

    public void setUserName(String userName) {
        this.userName = userName;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getPassword() {
        return password;
    }

    public void setPassword(String password) {
        this.password = password;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getPort() {
        return port;
    }

    public void setPort(String port) {
        this.port = port;
    }

    public int getIs_internet() {
        return is_internet;
    }

    public void setIs_internet(int is_internet) {
        this.is_internet = is_internet;
    }
}

/** Envelope of the JSON response: a success flag and a data map. */
class ProxyResponse {
    private String success;
    private Map<String, Object> data;

    public String getSuccess() {
        return success;
    }

    public void setSuccess(String success) {
        this.success = success;
    }

    public Map<String, Object> getData() {
        return data;
    }

    public void setData(Map<String, Object> data) {
        this.data = data;
    }
}
以上就是使用 Java 爬取代理 IP 的完整方法,你学到相关的知识或技能了吗?如果还想学到更多技能或者丰富自己的知识储备,欢迎关注恰卡编程网行业资讯频道。
目录
推荐阅读
0 条评论
本站已关闭游客评论,请登录或者注册后再评论吧~