Python Crawler: Scraping Lagou Job Listings (Source Code)

2021-06-27 · yuan · Original · About 1,015 words, estimated reading time: 4 minutes.

How do you trace content shown on a page back to its data source?

Copy a piece of the content you want to trace, press F12 to open DevTools, refresh the page, and search for that text under the Network tab. When a request matches, click it: that request is the interface the data comes from.
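To confirm you have found the right endpoint, you can replay the request outside the browser. Below is a minimal probe of the positionAjax.json interface used in the listing that follows; note that without the cookies of a real browser session, Lagou's anti-crawler layer often returns an error notice instead of job data, so the sketch prints the raw body rather than parsing it as JSON.

# Minimal probe of the endpoint discovered via DevTools (a sketch; the
# headers below are the bare minimum and may not be enough to get real data).
import requests

api_url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'referer': 'https://www.lagou.com/jobs/list_java',
}
resp = requests.post(api_url, headers=headers,
                     data={'first': 'true', 'pn': '1', 'kd': 'python'})
print(resp.status_code)
print(resp.text[:200])  # job JSON if accepted, an anti-crawler notice if not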

import requests
import time
import pymysql

# MySQL connection (fill in your own host/password/database)
conn = pymysql.connect(host="", user="root", password="", database="", charset="utf8")
cursor = conn.cursor()

# Warm-up GET against the listing page to obtain session cookies
main_url = 'https://www.lagou.com/jobs/list_java?labelWords=&fromSearch=true&suginput='
headers_0 = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
}
cookies_response = requests.get(main_url, headers=headers_0)
print(cookies_response.cookies)

# JSON API behind the listing page, found via DevTools as described above
api_url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
sid = ''  # showId returned with page 1, passed back as 'sig' on later pages
for page in range(1, 31):
    if page == 1:
        data = {
            'first': 'true',
            'pn': '1',
            'kd': 'python'
        }
    else:
        data = {
            'first': 'false',
            'pn': str(page),
            'kd': 'python',
            'sig': sid
        }
    header = {
        "authority": "www.lagou.com",
        "cookie": "user_trace_token=20210627151414-7baf440f-d4d8-4cf8-b2cd-9f8d094cba25; JSESSIONID=ABAAAECAAEBABII9B80B115A7B06EAB4727D0C8443CA7F3; LGUID=20210627151414-d68b5f81-6a4f-44b6-8364-a2a954037383; sajssdk_2015_cross_new_user=1; _ga=GA1.2.997903917.1624778056; _gid=GA1.2.1810296173.1624778056; WEBTJ-ID=20210627%E4%B8%8B%E5%8D%883:14:22151422-17a4c51b9354e-0cf41873464a2-6373267-1327104-17a4c51b936da6; RECOMMEND_TIP=true; privacyPolicyPopup=false; LGSID=20210627151421-35ebc2b6-6a91-4e62-9d01-89ff906d6cd3; sensorsdata2015session=%7B%7D; index_location_city=%E5%85%A8%E5%9B%BD; __lg_stoken__=d933d16a05691c24fa63ff3b430203a067d5d31f537dff3c88d798b73a06c8d672365afa132fc359df75c4260b52c2c082917561f9d985756c99339f0e6eb941518a3c6192ec; X_MIDDLE_TOKEN=a8b51f2cd6f6374bbd33a90d35fc9a99; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1624778396,1624778846,1624778857,1624778880; gate_login_token=ba2a2c73430323a943d25a84017ba53fc6db412cbe3e0515; _putrc=D7115C3DCB8E3D87; login=true; unick=%E8%A2%81%E5%8A%B2%E6%9D%BE; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=143; __SAFETY_CLOSE_TIME__9136890=1; TG-TRACK-CODE=index_checkmore; _gat=1; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%229136890%22%2C%22%24device_id%22%3A%2217a4c51a1d9353-0780f782aac12e-6373267-1327104-17a4c51a1da416%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2291.0.4472.114%22%2C%22lagou_company_id%22%3A%22%22%7D%2C%22first_id%22%3A%2217a4c51a1d9353-0780f782aac12e-6373267-1327104-17a4c51a1da416%22%7D; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1624780861; LGRID=20210627160100-9b1967b3-d0c0-49f8-a8c6-3bbd773f3392; X_HTTP_TOKEN=0a92058e2570a6b907808742614489fb89a8fb2e8f; SEARCH_ID=23027ec4ba9246c79885084b4b4eae65",
        "origin": "https://www.lagou.com",
        "referer": "https://www.lagou.com/jobs/list_java/p-city_0?&cl=false&fromSearch=true&labelWords=&suginput=",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
    }
    print(data)
    if page % 6 == 0:
        time.sleep(10)  # pause every 6 pages to avoid being rate-limited
    response = requests.post(api_url,
                             headers=header,
                             # cookies=cookies_response.cookies,
                             data=data)
    resp_json = response.json()
    result = resp_json['content']['positionResult']['result']
    sid = resp_json['content']['showId']
    for r in result:
        d = {
            'city': r['city'],
            'companyFullName': r['companyFullName'],
            'companySize': r['companySize'],
            'education': r['education'],
            'positionName': r['positionName'],
            'salary': r['salary'],
            'workYear': r['workYear']
            # 'positionDetail': r['positionDetail']
        }
        # SQL statement to execute
        print(r['createTime'])
        sql = "INSERT INTO resource_collection_job (platform,position_name,publish_time,city,company_full_name,company_size,education,salary,work_year,position_detail) VALUES ('拉勾', %s,%s,%s,%s,%s,%s,%s,%s,%s)"
        # run the insert; positionDetail is not always present in the
        # list API response, so fall back to an empty string
        cursor.execute(sql, (r['positionName'], r['createTime'], r['city'],
                             r['companyFullName'], r['companySize'], r['education'],
                             r['salary'], r['workYear'], r.get('positionDetail', '')))
        # with open('lagou.csv', mode='a', encoding='utf-8-sig') as f:
        #     f.write(",".join(list(d.values())) + '\n')
    # commit the transaction once per page
    conn.commit()

cursor.close()
conn.close()
print('Done')
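The INSERT statement above implies a resource_collection_job table roughly like the following. The original post does not include the DDL, so the column types and lengths here are assumptions reconstructed from the column list; adjust them to your data.

# A possible schema for resource_collection_job, reconstructed from the
# columns named in the INSERT above -- every type/length is an assumption.
import pymysql

conn = pymysql.connect(host="", user="root", password="", database="", charset="utf8")
cursor = conn.cursor()
cursor.execute("""
CREATE TABLE IF NOT EXISTS resource_collection_job (
    id                INT AUTO_INCREMENT PRIMARY KEY,
    platform          VARCHAR(32),
    position_name     VARCHAR(128),
    publish_time      VARCHAR(32),
    city              VARCHAR(32),
    company_full_name VARCHAR(255),
    company_size      VARCHAR(32),
    education         VARCHAR(32),
    salary            VARCHAR(32),
    work_year         VARCHAR(32),
    position_detail   TEXT
) DEFAULT CHARSET = utf8mb4
""")
conn.commit()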
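The listing hard-codes a logged-in browser cookie in the request header, and the commented-out cookies=cookies_response.cookies line hints at the alternative: let a requests.Session carry the warm-up cookies into the subsequent POSTs automatically. Below is a sketch of that variant; it assumes anonymous cookies are accepted, and fields tied to a logged-in account (such as gate_login_token) may still be required in practice.

# Variant using requests.Session: the cookies set by the warm-up GET are
# sent automatically with later POSTs instead of a hard-coded cookie header.
import requests

session = requests.Session()
session.headers['user-agent'] = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/91.0.4472.114 Safari/537.36')
# Warm-up GET: the listing page sets the session cookies
session.get('https://www.lagou.com/jobs/list_java?labelWords=&fromSearch=true&suginput=')
resp = session.post(
    'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false',
    headers={'referer': 'https://www.lagou.com/jobs/list_java'},
    data={'first': 'true', 'pn': '1', 'kd': 'python'},
)
print(resp.text[:200])  # job JSON if the anonymous session is accepted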