Python怎么实现爬取腾讯招聘网岗位信息

Python怎么实现爬取腾讯招聘网岗位信息

本篇内容主要讲解“Python怎么实现爬取腾讯招聘网岗位信息”,感兴趣的朋友不妨来看看。本文介绍的方法操作简单快捷,实用性强。下面就让小编来带大家学习“Python怎么实现爬取腾讯招聘网岗位信息”吧!

介绍

开发环境

Windows 10

python3.6

开发工具

pycharm

依赖库:requests_html、numpy、matplotlib、time、xlutils.copy、os、xlwt、xlrd、random

效果展示

代码运行展示

实现思路

1.打开腾讯招聘网站,右键选择“检查”打开开发者工具进行抓包。进入页面后可以发现页面是异步渲染的,我们需要的岗位数据是通过异步请求(接口)加载的

2.构造起始地址:

start_url = 'https://careers.tencent.com/tencentcareer/api/post/Query'

请求参数位于开发者工具 Headers 面板的最下方(Query String Parameters):

timestamp: 1625641250509

countryId:

cityId:

bgIds:

productId:

categoryId:

parentCategoryId:

attrId:

keyword:

pageIndex: 1

pageSize: 10

language: zh-cn

area: cn

3.发送请求,获取响应

# Excerpt from TXSpider: build the query and fetch one page of postings.
self.start_url = 'https://careers.tencent.com/tencentcareer/api/post/Query'

# Query-string parameters expected by the API.
params = {
    # Current timestamp in milliseconds.
    'timestamp': str(int(time.time() * 1000)),
    'countryId': '',
    'cityId': '',
    'bgIds': '',
    'productId': '',
    'categoryId': '',
    'parentCategoryId': '',
    'attrId': '',
    'keyword': '',
    'pageIndex': str(self.start_page),
    'pageSize': '10',
    'language': 'zh-cn',
    'area': 'cn',
}
# Pick a random user agent for every request.
headers = {'user-agent': random.choice(USER_AGENT_LIST)}
# A timeout keeps the crawler from hanging forever on a stalled connection.
response = session.get(
    url=self.start_url, headers=headers, params=params, timeout=10
).json()

4.提取数据,获取岗位信息大列表,提取相应的数据

# Excerpt from TXSpider.parse_response_json: pull the fields out of one page.
json_data = response['Data']['Posts']
if not json_data:
    # No postings (None or empty list): stop the paging loop.
    self.is_running = False
else:
    # Each element is a dict describing one posting.
    for data in json_data:
        # Work location; also collected for the charts.
        LocationName = data['LocationName']
        self.addr_list.append(LocationName)
        # Job category; also collected for the charts.
        CategoryName = data['CategoryName']
        self.category_list.append(CategoryName)
        # Posting title.
        RecruitPostName = data['RecruitPostName']
        # Responsibilities text.
        Responsibility = data['Responsibility']
        # Last update time.
        LastUpdateTime = data['LastUpdateTime']
        # URL of the posting's detail page.
        PostURL = data['PostURL']

5.数据生成折线图、饼图、散点图、柱状图

# Chart 1: line chart of location counts against category counts.
# These two rcParams settings make matplotlib render Chinese text correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# The two dicts can differ in size, so truncate both to a common length
# (at most 5); plt.plot requires x and y of equal length.
size = min(len(addr_dict), len(cate_dict), 5)
x_axis_data = list(addr_dict.values())[:size]
y_axis_data = list(cate_dict.values())[:size]
# Arguments: x values, y values, marker/line style, colour, alpha,
# line width and legend label. The x values are the location counts so that
# the plotted data matches the axis labels below.
plt.plot(x_axis_data, y_axis_data, 'o-', color='#4169E1', alpha=0.8,
         linewidth=1, label='数量')
# Without legend() the label passed to plot() is never displayed.
plt.legend(loc="upper right")
plt.xlabel('地点数量')
plt.ylabel('工作属性数量')
plt.savefig('根据岗位地址和岗位属性二者数量生成折线图.png')
plt.show()

# Chart 2: pie chart of the number of postings per work location.
addr_dict_key = list(addr_dict.keys())
addr_dict_value = list(addr_dict.values())
# Font family name contains a space: 'Microsoft YaHei'.
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False
plt.pie(addr_dict_value, labels=addr_dict_key, autopct='%1.1f%%')
plt.title('岗位地址和岗位属性百分比分布')
plt.savefig('岗位地址和岗位属性百分比分布-饼图')
plt.show()

# Chart 3: scatter plot of the combined location / category counts.
# These two rcParams settings make matplotlib render Chinese text correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Names go on the y axis, their counts on the x axis.
production = list(data.keys())
tem = list(data.values())
colors = np.random.rand(len(tem))  # one random colour value per point
plt.scatter(tem, production, s=200, c=colors)  # marker size 200
plt.xlabel('数量')  # x-axis title
plt.ylabel('名称')  # y-axis title
plt.savefig('岗位地址和岗位属性散点图')
plt.show()

# Chart 4: bar chart of the combined location / category counts.
# The rcParams font setting is enough for Chinese labels, so the original
# hard-coded Windows font path, the mid-run backend switch and the stray
# legend with unrelated labels are not needed.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
name_list = list(data.keys())
num_list = list(data.values())
width = 0.5  # bar width
index = np.arange(len(name_list))
plt.bar(index, num_list, width, color='steelblue', tick_label=name_list,
        label='岗位数量')
# Print each count above its bar; counts are integers, so no decimals.
for a, b in zip(index, num_list):
    plt.text(a, b, f'{b}', ha='center', va='bottom', fontsize=7)
plt.xticks(rotation=270)
plt.title('岗位数量和岗位属性数量柱状图')
plt.ylabel('次')
plt.legend()
plt.savefig('岗位数量和岗位属性数量柱状图-柱状图', bbox_inches='tight')
plt.show()

源码展示

"""ua大列表"""USER_AGENT_LIST=['Mozilla/5.0(WindowsNT6.2;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/60.0.3112.90Safari/537.36','Mozilla/5.0(Macintosh;IntelMacOSX10_13_4)AppleWebKit/537.36(KHTML,likeGecko)Chrome/69.0.3451.0Safari/537.36','Mozilla/5.0(Macintosh;IntelMacOSX10.9;rv:57.0)Gecko/20100101Firefox/57.0','Mozilla/5.0(WindowsNT5.1)AppleWebKit/537.36(KHTML,likeGecko)Chrome/28.0.1500.71Safari/537.36','Mozilla/5.0(Macintosh;IntelMacOSX10_12_3)AppleWebKit/537.36(KHTML,likeGecko)Chrome/58.0.2999.0Safari/537.36','Mozilla/5.0(WindowsNT6.3;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/53.0.2785.70Safari/537.36','Mozilla/5.0(Macintosh;U;IntelMacOSX10.4;en-US;rv:1.9.2.2)Gecko/20100316Firefox/3.6.2','Mozilla/5.0(WindowsNT5.1)AppleWebKit/537.36(KHTML,likeGecko)Chrome/44.0.2403.155Safari/537.36OPR/31.0.1889.174','Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.1;Trident/4.0;SLCC2;.NETCLR2.0.50727;.NETCLR3.5.30729;.NETCLR3.0.30729;.NETCLR1.1.4322;MS-RTCLM8;InfoPath.2;TabletPC2.0)','Mozilla/5.0(Macintosh;IntelMacOSX10_15_3)AppleWebKit/537.36(KHTML,likeGecko)Chrome/75.0.3770.100Safari/537.36','Mozilla/5.0(Macintosh;IntelMacOSX10_12_6)AppleWebKit/537.36(KHTML,likeGecko)Chrome/68.0.3440.106Safari/537.36OPR/55.0.2994.61','Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/535.1(KHTML,likeGecko)Chrome/14.0.814.0Safari/535.1','Mozilla/5.0(Macintosh;U;PPCMacOSX;ja-jp)AppleWebKit/418.9.1(KHTML,likeGecko)Safari/419.3','Mozilla/5.0(Macintosh;IntelMacOSX10_10_5)AppleWebKit/537.36(KHTML,likeGecko)Chrome/43.0.2357.134Safari/537.36','Mozilla/5.0(compatible;MSIE10.0;WindowsNT6.1;Trident/6.0;Touch;MASMJS)','Mozilla/5.0(X11;Linuxi686)AppleWebKit/535.21(KHTML,likeGecko)Chrome/19.0.1041.0Safari/535.21','Mozilla/5.0(Macintosh;IntelMacOSX10_15_3)AppleWebKit/537.36(KHTML,likeGecko)Chrome/69.0.3497.100Safari/537.36','Mozilla/5.0(WindowsNT6.2;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/60.0.3112.90Safari/537.36','Mozilla/5.0(Macintosh;IntelMacOSX10_13_4)AppleWebKit/537.36(KHTML,likeGecko)Chrom
e/69.0.3451.0Safari/537.36','Mozilla/5.0(Macintosh;IntelMacOSX10.9;rv:57.0)Gecko/20100101Firefox/57.0','Mozilla/5.0(WindowsNT5.1)AppleWebKit/537.36(KHTML,likeGecko)Chrome/28.0.1500.71Safari/537.36','Mozilla/5.0(Macintosh;IntelMacOSX10_12_3)AppleWebKit/537.36(KHTML,likeGecko)Chrome/58.0.2999.0Safari/537.36','Mozilla/5.0(WindowsNT6.3;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/53.0.2785.70Safari/537.36','Mozilla/5.0(Macintosh;U;IntelMacOSX10.4;en-US;rv:1.9.2.2)Gecko/20100316Firefox/3.6.2','Mozilla/5.0(WindowsNT5.1)AppleWebKit/537.36(KHTML,likeGecko)Chrome/44.0.2403.155Safari/537.36OPR/31.0.1889.174','Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.1;Trident/4.0;SLCC2;.NETCLR2.0.50727;.NETCLR3.5.30729;.NETCLR3.0.30729;.NETCLR1.1.4322;MS-RTCLM8;InfoPath.2;TabletPC2.0)','Mozilla/5.0(Macintosh;IntelMacOSX10_15_3)AppleWebKit/537.36(KHTML,likeGecko)Chrome/75.0.3770.100Safari/537.36','Mozilla/5.0(Macintosh;IntelMacOSX10_12_6)AppleWebKit/537.36(KHTML,likeGecko)Chrome/68.0.3440.106Safari/537.36OPR/55.0.2994.61','Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/535.1(KHTML,likeGecko)Chrome/14.0.814.0Safari/535.1','Mozilla/5.0(Macintosh;U;PPCMacOSX;ja-jp)AppleWebKit/418.9.1(KHTML,likeGecko)Safari/419.3','Mozilla/5.0(Macintosh;IntelMacOSX10_10_5)AppleWebKit/537.36(KHTML,likeGecko)Chrome/43.0.2357.134Safari/537.36','Mozilla/5.0(compatible;MSIE10.0;WindowsNT6.1;Trident/6.0;Touch;MASMJS)','Mozilla/5.0(X11;Linuxi686)AppleWebKit/535.21(KHTML,likeGecko)Chrome/19.0.1041.0Safari/535.21','Mozilla/5.0(Macintosh;IntelMacOSX10_15_3)AppleWebKit/537.36(KHTML,likeGecko)Chrome/69.0.3497.100Safari/537.36','Mozilla/5.0(WindowsNT6.1;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/83.0.4093.3Safari/537.36','Mozilla/5.0(Macintosh;IntelMacOSX10_14_5)AppleWebKit/537.36(KHTML,likeGecko;compatible;Swurl)Chrome/77.0.3865.120Safari/537.36','Mozilla/5.0(WindowsNT10.0;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/87.0.4280.88Safari/537.36','Mozilla/5.0(Macintosh;IntelMacOSX10_14_5)AppleWebKit/537.36(KHTML,
likeGecko)Chrome/87.0.4280.88Safari/537.36','Mozilla/5.0(Macintosh;IntelMacOSX10_14_6)AppleWebKit/537.36(KHTML,likeGecko)Chrome/74.0.3729.131Safari/537.36','Mozilla/5.0(WindowsNT10.0;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/83.0.4086.0Safari/537.36','Mozilla/5.0(WindowsNT6.1;WOW64;rv:75.0)Gecko/20100101Firefox/75.0','Mozilla/5.0(WindowsNT6.1;WOW64)AppleWebKit/537.36(KHTML,likeGecko)coc_coc_browser/91.0.146Chrome/85.0.4183.146Safari/537.36','Mozilla/5.0(Windows;U;WindowsNT5.2;en-US)AppleWebKit/537.36(KHTML,likeGecko)Safari/537.36VivoBrowser/8.4.72.0Chrome/62.0.3202.84','Mozilla/5.0(WindowsNT6.3;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/87.0.4280.101Safari/537.36','Mozilla/5.0(Macintosh;IntelMacOSX10_15_7)AppleWebKit/537.36(KHTML,likeGecko)Chrome/87.0.4280.88Safari/537.36Edg/87.0.664.60','Mozilla/5.0(Macintosh;IntelMacOSX10.16;rv:83.0)Gecko/20100101Firefox/83.0','Mozilla/5.0(X11;CrOSx86_6413505.63.0)AppleWebKit/537.36(KHTML,likeGecko)Chrome/87.0.4280.88Safari/537.36','Mozilla/5.0(Macintosh;IntelMacOSX10.9;rv:68.0)Gecko/20100101Firefox/68.0','Mozilla/5.0(Macintosh;IntelMacOSX10_15_7)AppleWebKit/537.36(KHTML,likeGecko)Chrome/87.0.4280.101Safari/537.36','Mozilla/5.0(Macintosh;IntelMacOSX10_15_1)AppleWebKit/537.36(KHTML,likeGecko)Chrome/87.0.4280.88Safari/537.36','Mozilla/5.0(WindowsNT10.0;WOW64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/86.0.4240.198Safari/537.36OPR/72.0.3815.400','Mozilla/5.0(X11;Linuxx86_64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/87.0.4280.101Safari/537.36',]fromrequests_htmlimportHTMLSessionimportos,xlwt,xlrd,randomfromxlutils.copyimportcopyimportnumpyasnpfrommatplotlibimportpyplotaspltfrommatplotlib.font_managerimportFontProperties#字体库importtimesession=HTMLSession()classTXSpider(object):def__init__(self):#起始的请求地址self.start_url='https://careers.tencent.com/tencentcareer/api/post/Query'#起始的翻页页码self.start_page=1#翻页条件self.is_running=True#准备工作地点大列表self.addr_list=[]#准备岗位种类大列表self.category_list=[]defparse_start_url(self):"""解析起始的url地址:retu
rn:"""#条件循环模拟翻页whileself.is_running:#构造请求参数params={#捕捉当前时间戳'timestamp':str(int(time.time()*1000)),'countryId':'','cityId':'','bgIds':'','productId':'','categoryId':'','parentCategoryId':'','attrId':'','keyword':'','pageIndex':str(self.start_page),'pageSize':'10','language':'zh-cn','area':'cn'}headers={'user-agent':random.choice(USER_AGENT_LIST)}response=session.get(url=self.start_url,headers=headers,params=params).json()"""调用解析响应方法"""self.parse_response_json(response)"""翻页递增"""self.start_page+=1"""翻页终止条件"""ifself.start_page==20:self.is_running=False"""翻页完成,开始生成分析图"""self.crate_img_four_func()defcrate_img_four_func(self):"""生成四张图方法:return:"""#统计数量data={}#大字典addr_dict={}#工作地址字典cate_dict={}#工作属性字典fork_addr,v_cateinzip(self.addr_list,self.category_list):ifk_addrindata:#大字典统计工作地址数据data[k_addr]=data[k_addr]+1#地址字典统计数据addr_dict[k_addr]=addr_dict[k_addr]+1else:data[k_addr]=1addr_dict[k_addr]=1ifv_cateindata:#大字典统计工作属性数据data[v_cate]=data[v_cate]+1#工作属性字典统计数据cate_dict[v_cate]=data[v_cate]+1else:data[v_cate]=1cate_dict[v_cate]=1#第一张图:根据岗位地址和岗位属性二者数量生成折线图#146,147两行代码解决图中中文显示问题plt.rcParams['font.sans-serif']=['SimHei']plt.rcParams['axes.unicode_minus']=False#由于二者数据数量不统一,在此进行切片操作x_axis_data=[iforiinaddr_dict.values()][:5]y_axis_data=[iforiincate_dict.values()][:5]#print(x_axis_data,y_axis_data)#plot中参数的含义分别是横轴值,纵轴值,线的形状,颜色,透明度,线的宽度和标签plt.plot(y_axis_data,x_axis_data,'ro-',color='#4169E1',alpha=0.8,linewidth=1,label='数量')#显示标签,如果不加这句,即使在plot中加了label='一些数字'的参数,最终还是不会显示标签plt.legend(loc="upperright")plt.xlabel('地点数量')plt.ylabel('工作属性数量')plt.savefig('根据岗位地址和岗位属性二者数量生成折线图.png')plt.show()#第二张图:根据岗位地址数量生成饼图"""工作地址饼图"""addr_dict_key=[kforkinaddr_dict.keys()]addr_dict_value=[vforvinaddr_dict.values()]plt.rcParams['font.sans-serif']=['MicrosoftYaHei']plt.rcParams['axes.unicode_minus']=Falseplt.pie(addr_dict_value,labels=addr_dict_key,autopct='%1.1f%%')plt.title(f'岗位地址和岗位属性百分比分布')plt.savefig(f'岗位地址和岗位属性百分比分布-饼图')plt.show()#第三张图:根据岗位地址和岗位属性二者数量生成散点图#这两行代码解决plt中文显示的问题plt.rcParams['font.sans-se
rif']=['SimHei']plt.rcParams['axes.unicode_minus']=False#输入岗位地址和岗位属性数据production=[iforiindata.keys()]tem=[iforiindata.values()]colors=np.random.rand(len(tem))#颜色数组plt.scatter(tem,production,s=200,c=colors)#画散点图,大小为200plt.xlabel('数量')#横坐标轴标题plt.ylabel('名称')#纵坐标轴标题plt.savefig(f'岗位地址和岗位属性散点图')plt.show()#第四张图:根据岗位地址和岗位属性二者数量生成柱状图importmatplotlib;matplotlib.use('TkAgg')plt.rcParams['font.sans-serif']=['SimHei']plt.rcParams['axes.unicode_minus']=Falsezhfont1=matplotlib.font_manager.FontProperties(fname='C:\Windows\Fonts\simsun.ttc')name_list=[namefornameindata.keys()]num_list=[valueforvalueindata.values()]width=0.5#柱子的宽度index=np.arange(len(name_list))plt.bar(index,num_list,width,color='steelblue',tick_label=name_list,label='岗位数量')plt.legend(['分解能耗','真实能耗'],prop=zhfont1,labelspacing=1)fora,binzip(index,num_list):#柱子上的数字显示plt.text(a,b,'%.2f'%b,ha='center',va='bottom',fontsize=7)plt.xticks(rotation=270)plt.title('岗位数量和岗位属性数量柱状图')plt.ylabel('次')plt.legend()plt.savefig(f'岗位数量和岗位属性数量柱状图-柱状图',bbox_inches='tight')plt.show()defparse_response_json(self,response):"""解析响应:paramresponse::return:"""#获取岗位信息大列表json_data=response['Data']['Posts']#判断结果是否有数据ifjson_dataisNone:#没有数据,设置循环条件为Falseself.is_running=False#反之,开始提取数据else:#循环遍历,取出列表中的每一个岗位字典#通过key取value值的方法进行采集数据fordatainjson_data:#工作地点LocationName=data['LocationName']#往地址大列表中添加数据self.addr_list.append(LocationName)#工作属性CategoryName=data['CategoryName']#往工作属性大列表中添加数据self.category_list.append(CategoryName)#岗位名称RecruitPostName=data['RecruitPostName']#岗位职责Responsibility=data['Responsibility']#发布时间LastUpdateTime=data['LastUpdateTime']#岗位地址PostURL=data['PostURL']#构造保存excel所需要的格式字典data_dict={#该字典的key值与创建工作簿的sheet表的名称所关联'岗位详情':[RecruitPostName,LocationName,CategoryName,Responsibility,LastUpdateTime,PostURL]}"""调用保存excel表格方法,数据字典作为参数"""self.save_excel(data_dict)#提示输出print(f"第{self.start_page}页--岗位{RecruitPostName}----采集完成----logging!!!")defsave_excel(self,data_dict):"""保存excel:paramdata_dict:数据字典:return:"""#判断保存到当我文件目录的路径是否存在os_path_1=os.getcw
d()+'/数据/'ifnotos.path.exists(os_path_1):#不存在,即创建这个目录,即创建”数据“这个文件夹os.mkdir(os_path_1)#判断将数据保存到表格的这个表格是否存在,不存在,创建表格,写入表头os_path=os_path_1+'腾讯招聘数据.xls'ifnotos.path.exists(os_path):#创建新的workbook(其实就是创建新的excel)workbook=xlwt.Workbook(encoding='utf-8')#创建新的sheet表worksheet1=workbook.add_sheet("岗位详情",cell_overwrite_ok=True)excel_data_1=('岗位名称','工作地点','工作属性','岗位职责','发布时间','岗位地址')foriinrange(0,len(excel_data_1)):worksheet1.col(i).width=2560*3#行,列,内容,样式worksheet1.write(0,i,excel_data_1[i])workbook.save(os_path)#判断工作表是否存在#存在,开始往表格中添加数据(写入数据)ifos.path.exists(os_path):#打开工作薄workbook=xlrd.open_workbook(os_path)#获取工作薄中所有表的个数sheets=workbook.sheet_names()foriinrange(len(sheets)):fornameindata_dict.keys():worksheet=workbook.sheet_by_name(sheets[i])#获取工作薄中所有表中的表名与数据名对比ifworksheet.name==name:#获取表中已存在的行数rows_old=worksheet.nrows#将xlrd对象拷贝转化为xlwt对象new_workbook=copy(workbook)#获取转化后的工作薄中的第i张表new_worksheet=new_workbook.get_sheet(i)fornuminrange(0,len(data_dict[name])):new_worksheet.write(rows_old,num,data_dict[name][num])new_workbook.save(os_path)defrun(self):"""启动运行:return:"""self.parse_start_url()if__name__=='__main__':#创建该类的对象t=TXSpider()#通过实例方法,进行调用t.run()

到此,相信大家对“Python怎么实现爬取腾讯招聘网岗位信息”有了更深的了解,不妨来实际操作一番吧!这里是亿速云网站,更多相关内容可以进入相关频道进行查询,关注我们,继续学习!

发布于 2022-01-05 23:35:04
收藏
分享
海报
0 条评论
37
上一篇:css怎么实现动画 下一篇:JS的script标签属性有哪些
目录

    推荐阅读

    0 条评论

    本站已关闭游客评论,请登录或者注册后再评论吧~

    忘记密码?

    图形验证码