使用java怎么爬取代理IP

本篇文章介绍如何使用 Java 爬取代理 IP:用 Jsoup 抓取代理列表页面,用正则表达式提取 IP 和端口,逐个验证代理是否可用,最后用 fastjson 把结果序列化为 JSON 字符串返回。首先需要在 pom.xml 中加入以下两个依赖:

<!-- Maven dependencies used by the Java code below: fastjson (JSONObject) to serialize the result, jsoup (Jsoup, Document) to fetch and parse pages. -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.28</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>

完整的代码如下:

使用java怎么爬取代理IP

package com.tuniu.fcm.facade.IPProxy;

import com.alibaba.fastjson.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
*获取代理IP,需要
*com.alibaba.fastjson.JSONObject以及Jsoup
*/
publicclassProxyCralwerUnusedVPN{
ThreadLocal<Integer>localWantedNumber=newThreadLocal<Integer>();
ThreadLocal<List<ProxyInfo>>localProxyInfos=newThreadLocal<List<ProxyInfo>>();
publicstaticvoidmain(String[]args){
ProxyCralwerUnusedVPNproxyCrawler=newProxyCralwerUnusedVPN();
/**
*想要获取的代理IP个数,由需求方自行指定。(如果个数太多,将导致返回变慢)
*/
proxyCrawler.startCrawler(1);
}
/**
*暴露给外部模块调用的入口
*@paramwantedNumber调用方期望获取到的代理IP个数
*/
publicStringstartCrawler(intwantedNumber){
localWantedNumber.set(wantedNumber);
kuaidailiCom("http://www.xicidaili.com/nn/",15);
kuaidailiCom("http://www.xicidaili.com/nt/",15);
kuaidailiCom("http://www.xicidaili.com/wt/",15);
kuaidailiCom("http://www.kuaidaili.com/free/inha/",15);
kuaidailiCom("http://www.kuaidaili.com/free/intr/",15);
kuaidailiCom("http://www.kuaidaili.com/free/outtr/",15);
/**
*构造返回数据
*/
ProxyResponseresponse=newProxyResponse();
response.setSuccess("true");
Map<String,Object>dataInfoMap=newHashMap<String,Object>();
dataInfoMap.put("numFound",localProxyInfos.get().size());
dataInfoMap.put("pageNum",1);
dataInfoMap.put("proxy",localProxyInfos.get());
response.setData(dataInfoMap);
StringresponseString=JSONObject.toJSON(response).toString();
System.out.println(responseString);
returnresponseString;
}
privatevoidkuaidailiCom(StringbaseUrl,inttotalPage){
StringipReg="\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\d{1,6}";
PatternipPtn=Pattern.compile(ipReg);
for(inti=1;i<totalPage;i++){
if(getCurrentProxyNumber()>=localWantedNumber.get()){
return;
}
try{
Documentdoc=Jsoup.connect(baseUrl+i+"/")
.header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
.header("Accept-Encoding","gzip,deflate,sdch")
.header("Accept-Language","zh-CN,zh;q=0.8,en;q=0.6")
.header("Cache-Control","max-age=0")
.header("User-Agent","Mozilla/5.0(Macintosh;IntelMacOSX10_11_4)AppleWebKit/537.36(KHTML,likeGecko)Chrome/51.0.2704.103Safari/537.36")
.header("Cookie","Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244;_gat=1;_ga=GA1.2.1061361785.1462812244")
.header("Host","www.kuaidaili.com")
.header("Referer","http://www.kuaidaili.com/free/outha/")
.timeout(30*1000)
.get();
Matcherm=ipPtn.matcher(doc.text());
while(m.find()){
if(getCurrentProxyNumber()>=localWantedNumber.get()){
break;
}
String[]strs=m.group().split("");
if(checkProxy(strs[0],Integer.parseInt(strs[1]))){
System.out.println("获取到可用代理IP\t"+strs[0]+"\t"+strs[1]);
addProxy(strs[0],strs[1],"http");
}
}
}catch(Exceptione){
e.printStackTrace();
}
}
}
privatestaticbooleancheckProxy(Stringip,Integerport){
try{
//http://1212.ip138.com/ic.asp可以换成任何比较快的网页
Jsoup.connect("http://1212.ip138.com/ic.asp")
.timeout(2*1000)
.proxy(ip,port)
.get();
returntrue;
}catch(Exceptione){
returnfalse;
}
}
privateintgetCurrentProxyNumber(){
List<ProxyInfo>proxyInfos=localProxyInfos.get();
if(proxyInfos==null){
proxyInfos=newArrayList<ProxyInfo>();
localProxyInfos.set(proxyInfos);
return0;
}
else{
returnproxyInfos.size();
}
}
privatevoidaddProxy(Stringip,Stringport,Stringprotocol){
List<ProxyInfo>proxyInfos=localProxyInfos.get();
if(proxyInfos==null){
proxyInfos=newArrayList<ProxyInfo>();
proxyInfos.add(newProxyInfo(ip,port,protocol));
}
else{
proxyInfos.add(newProxyInfo(ip,port,protocol));
}
}
}
/**
 * Describes one proxy server. Exposed through bean-style getters and setters so it
 * can be serialized as part of the crawler's JSON response; accessor names are kept
 * exactly as they are because property names are derived from them.
 */
class ProxyInfo {
    /** User name for the proxy; empty by default. */
    private String userName = "";
    /** Proxy host address. */
    private String ip;
    /** Password for the proxy; empty by default. */
    private String password = "";
    /** Proxy protocol, e.g. "http". */
    private String type;
    /** Proxy port, kept as text. */
    private String port;
    /** Flag with default value 1; its meaning is not defined in this file. */
    private int is_internet = 1;

    /**
     * @param ip   proxy host address
     * @param port proxy port as text
     * @param type proxy protocol
     */
    public ProxyInfo(String ip, String port, String type) {
        this.ip = ip;
        this.type = type;
        this.port = port;
    }

    public String getUserName() {
        return userName;
    }

    public void setUserName(String userName) {
        this.userName = userName;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getPassword() {
        return password;
    }

    public void setPassword(String password) {
        this.password = password;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getPort() {
        return port;
    }

    public void setPort(String port) {
        this.port = port;
    }

    public int getIs_internet() {
        return is_internet;
    }

    public void setIs_internet(int is_internet) {
        this.is_internet = is_internet;
    }
}
/**
 * Response envelope returned by the crawler: a success marker plus a free-form data
 * map. Exposed through bean-style getters and setters so it can be serialized to JSON.
 */
class ProxyResponse {
    /** Success marker as text (the crawler sets it to "true"). */
    private String success;
    /** Payload of the response, keyed by field name. */
    private Map<String, Object> data;

    public String getSuccess() {
        return success;
    }

    public void setSuccess(String success) {
        this.success = success;
    }

    public Map<String, Object> getData() {
        return data;
    }

    public void setData(Map<String, Object> data) {
        this.data = data;
    }
}

上述内容就是使用java怎么爬取代理IP,你们学到知识或技能了吗?如果还想学到更多技能或者丰富自己的知识储备,欢迎关注恰卡编程网行业资讯频道。

发布于 2021-04-08 13:39:11
收藏
分享
海报
0 条评论
163
上一篇:使用Java如何计算圆周率 下一篇:怎么在iOS中查找私有API
目录

    0 条评论

    本站已关闭游客评论,请登录或者注册后再评论吧~

    忘记密码?

    图形验证码