# -*- coding: utf-8 -*-
# author:zxy
# Date:2018-9-23
"""Scrape job postings from hr.tencent.com and append them to a text file.

The listing pages queried use ``keywords=python``.  Each listing page is
fetched, the detail links it contains are followed, and the fields of every
detail page are written to the output file as plain text.
"""

from lxml import etree
import requests

BASE_DOMAIN = "http://hr.tencent.com/"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/67.0.3396.99 Safari/537.36'
}
BASE_URL = "https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start=0"

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the whole crawl indefinitely.
REQUEST_TIMEOUT = 10

# The ``start`` query parameter advances by this amount per listing page.
PAGE_SIZE = 10

# Fields whose value is a list of text nodes rather than a single string.
_LIST_FIELDS = ('work_duty', 'work_require')


def _fetch_html(url):
    """Download *url* and return the parsed lxml HTML tree.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a posting.
    response.raise_for_status()
    return etree.HTML(response.text)


def _first_text(html, expression):
    """Return the first match of XPath *expression*, or '' if nothing matches."""
    matches = html.xpath(expression)
    return matches[0] if matches else ''


def parse_detail_page(url):
    """Fetch one position detail page and return its fields as a dict.

    Args:
        url: Absolute URL of the detail page.

    Returns:
        A dict with the string fields ``work_name``, ``work_place``,
        ``work_category`` and ``work_lack_number``, and the list fields
        ``work_duty`` and ``work_require`` (lists of text nodes).  A field
        that cannot be found on the page is returned empty ('' or [])
        instead of raising IndexError.
    """
    html = _fetch_html(url)

    # The page is expected to hold two <ul class="squareli"> lists: the
    # first is stored as the duties, the second as the requirements.
    info_lists = html.xpath("//ul[@class='squareli']")
    work_duty = info_lists[0].xpath(".//text()") if len(info_lists) > 0 else []
    work_require = info_lists[1].xpath(".//text()") if len(info_lists) > 1 else []

    # Key order matters: it is the order in which fields are written out.
    return {
        'work_name': _first_text(html, "//tr[@class='h']/td/text()"),
        'work_place': _first_text(html, "//tr[@class='c bottomline']/td[1]/text()"),
        'work_category': _first_text(html, "//tr[@class='c bottomline']/td[2]/text()"),
        'work_lack_number': _first_text(html, "//tr[@class='c bottomline']/td[3]/text()"),
        'work_duty': work_duty,
        'work_require': work_require,
    }


def get_detail_urls(url):
    """Return the absolute detail-page URLs linked from one listing page.

    Args:
        url: URL of the listing page to fetch.  (Previously this argument
            was ignored and BASE_URL was always fetched, so every call
            returned the links of the first page.)

    Returns:
        A list of detail-page URLs, each prefixed with BASE_DOMAIN.
    """
    html = _fetch_html(url)
    # NOTE(review): only rows with class 'even' are matched; if the listing
    # also uses another row class those postings are skipped -- confirm
    # against the page markup.
    hrefs = html.xpath("//tr[@class='even']//a/@href")
    return [BASE_DOMAIN + href for href in hrefs]


def _format_position(position):
    """Render one position dict as the text record written to the output file.

    Scalar fields are written as ``key:value``; list fields are written as
    ``key :<list repr>``.  Each field ends with a newline and the record is
    followed by three blank lines.
    """
    lines = []
    for key, value in position.items():
        if key in _LIST_FIELDS:
            lines.append('{} :{}'.format(key, value))
        else:
            lines.append(key + ":" + value)
    return ''.join(line + '\n' for line in lines) + '\n' * 3


def spider(pages=4, output_path='tecentRecruit.txt'):
    """Crawl the listing pages and append every position found to a file.

    Args:
        pages: Number of listing pages to crawl, starting from the first.
            Defaults to 4 as in the original script; the original carried
            the note "#43", presumably the total page count -- confirm.
        output_path: File to append records to (UTF-8).

    Returns:
        The list of position dicts that were scraped.
    """
    base_url = "https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start={}#a"
    positions = []
    # Open the output once for the whole crawl rather than once per record.
    with open(output_path, 'a', encoding='utf-8') as f:
        for page in range(pages):
            url = base_url.format(page * PAGE_SIZE)
            for detail_url in get_detail_urls(url):
                position = parse_detail_page(detail_url)
                positions.append(position)
                f.write(_format_position(position))
    return positions


if __name__ == '__main__':
    spider()
效果如图所示: