源码如下,该网站搜索使用 POST payload 发送 JSON 数据提交,所以要注意。
#filename : getbigdatainfo.py
#!/usr/bin/env python
#coding=utf-8
import requests #python3.x
import os
import time
import sys
import json
import xlwt
#url = 'http://www.ccgp-xinjiang.gov.cn/es-articles/es-article/_search'
#url = 'http://www.ccgp-xinjiang.gov.cn/front/search/category'
def getHtml(url):
    """POST the JSON search payload to *url* and return the response body text.

    The target site (ccgp-xinjiang.gov.cn) expects the search keyword and
    paging controls as a JSON request body, not as form fields.
    NOTE(review): when copying a payload from the browser dev tools, JSON
    `true` must become Python `True` or the literal will not parse.
    """
    # NOTE(review): this pageNo/pageSize/categoryCode schema is for the
    # current /front/search/category endpoint; the site has changed the
    # endpoint (and payload shape) before — confirm if requests start failing.
    payload = {"pageNo": 1, "pageSize": "15",
               "categoryCode": "ZcyAnnouncement3001", "keyword": "大数据"}
    headers = {
        'content-type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        # Content-Length removed: the hard-coded value 444 rarely matched the
        # actual body length; requests computes the correct value itself.
        'Host': 'www.ccgp-xinjiang.gov.cn',
        'Origin': 'www.ccgp-xinjiang.gov.cn',
        # Fixed typo: was 'aXMLHttpRequest', which no server recognizes.
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'http://www.ccgp-xinjiang.gov.cn/ZcyAnnouncement/ZcyAnnouncement2/ZcyAnnouncement3001/index.html'
    }
    # timeout added so a stalled connection cannot hang the script forever.
    r = requests.post(url, data=json.dumps(payload), headers=headers,
                      timeout=30)
    print("urlopen OK")
    return r.text
def getinfourl(html):
    """Parse the search-result JSON in *html* and write the listings to a
    timestamped ``bigdatainfo_YYYYmmddHHMMSS.xls`` workbook in the current
    directory.

    Expects an Elasticsearch-style response: hits under
    ``data['hits']['hits']``, each with a ``_source`` dict carrying
    title/url/districtName/gpCatalogName/publishDate.
    NOTE(review): the endpoint has changed before — confirm this response
    shape still matches what /front/search/category returns.
    """
    data = json.loads(html)
    print(data.keys())
    # Debug dumps of ES metadata; .get() so a schema change does not raise
    # KeyError and kill the run before anything is saved.
    print(data.get('took'))
    print(data.get('_shards'))
    items = data['hits']['hits']
    book = xlwt.Workbook()  # new workbook
    sheet = book.add_sheet('bigdata_sheet')  # single result sheet
    # Header row: region / type / title / detail URL / publish date.
    for col, title in enumerate(('招标地区', '招标类型', '招标名称',
                                 '详情地址', '发布时间')):
        sheet.write(0, col, title)
    row = 1  # data rows start below the header
    homeurl = 'http://www.ccgp-xinjiang.gov.cn'
    for item in items:
        src = item['_source']
        print(src['title'])
        print(src['url'])
        sheet.write(row, 0, src['districtName'])
        sheet.write(row, 1, src['gpCatalogName'])
        sheet.write(row, 2, src['title'])
        # 'url' is site-relative; prepend the host to get a clickable link.
        sheet.write(row, 3, homeurl + src['url'])
        publishdate_ = src['publishDate']
        # Bug fix: the original passed two arguments to print(), which printed
        # a tuple like ("publishdate is %s", 123) instead of interpolating.
        print('publishdate is %s' % publishdate_)
        print('now time %s' % time.time())
        # publishDate is epoch milliseconds; convert to seconds for localtime.
        _time = time.localtime(publishdate_ / 1000)
        publishdate = time.strftime("%Y-%m-%d %H:%M:%S", _time)
        sheet.write(row, 4, publishdate)
        row += 1
    filename = ('bigdatainfo_'
                + time.strftime("%Y%m%d%H%M%S", time.localtime()) + '.xls')
    book.save(filename)  # saved to the current working directory
    # Returning the filename is new but backward-compatible (caller ignored
    # the previous implicit None).
    return filename
# Entry point. Usage: python getbigdatainfo.py <search-endpoint-url>
# (the endpoint URL on the site changes from time to time, hence a parameter).
if __name__ == "__main__":
    # Fail with a usage message instead of an IndexError when the URL
    # argument is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: python getbigdatainfo.py <search-endpoint-url>")
    html = getHtml(sys.argv[1])
    result = getinfourl(html)
使用方法,命令行执行
python getbigdatainfo.py http://www.ccgp-xinjiang.gov.cn/front/search/category
后面这个网址参数会变动,1个月前我写这个的时候还不是这个,今天使用就变了,更新了一下
声明:我要去上班所有作品(图文、音视频)均由用户自行上传分享,仅供网友学习交流,版权归原作者丝路阿凡提所有,原文出处。若您的权利被侵害,请联系删除。
本文标题:(python爬取中标信息)(python爬取项目简介)
本文链接:https://www.51qsb.cn/article/dvjru8.html