(Scraping bid-winning announcements with Python) (A short Python scraping project)

The full source is below. Note that the site's search endpoint expects the query as a JSON POST payload, so the request has to be built accordingly.
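Before the full script, here is a minimal sketch of posting a JSON payload with requests, using the same endpoint and keyword that appear later in the article. Passing the dict through the json= parameter lets requests serialize it and set the Content-Type header itself, which also sidesteps the true/True issue mentioned in the comments below; the script itself uses data=json.dumps(payload) instead, which works just as well.

# Minimal sketch: POST a JSON search payload and peek at the raw response.
import requests

url = 'http://www.ccgp-xinjiang.gov.cn/front/search/category'
payload = {"pageNo": 1, "pageSize": "15",
           "categoryCode": "ZcyAnnouncement3001", "keyword": "大数据"}

# json=payload makes requests serialize the dict and set
# Content-Type: application/json; Python booleans (True/False)
# are emitted as JSON true/false automatically.
resp = requests.post(url, json=payload, timeout=30)
print(resp.status_code)
print(resp.text[:500])   # first part of the response body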

#!/usr/bin/env python
# coding=utf-8
# filename: getbigdatainfo.py

import requests  # Python 3.x
import os
import time
import sys
import json
import xlwt

# Old endpoint (Elasticsearch proxy) and current endpoint, kept for reference:
#url = 'http://www.ccgp-xinjiang.gov.cn/es-articles/es-article/_search'
#url = 'http://www.ccgp-xinjiang.gov.cn/front/search/category'

def getHtml(url):
    # The keyword and paging controls are sent as a JSON POST payload.
    # Note: true/false in the JSON captured from the browser must become
    # True/False, otherwise Python raises a NameError.
    # Payload for the old Elasticsearch endpoint, kept for reference:
    #payload = {"from":0,"size":"50","query":{"bool":{"must":[{"term":{"siteId":{"value":"39","boost":1}}},{"multi_match":{"query":"大数据","fields":["title^1.0"],"type":"best_fields","operator":"OR","slop":0,"prefix_length":0,"max_expansions":50,"zero_terms_query":"NONE","auto_generate_synonyms_phrase_query":True,"fuzzy_transpositions":True,"boost":1}},{"wildcard":{"path":{"wildcard":"*6zcyannouncement30016*","boost":1}}}],"adjust_pure_negative":True,"boost":1,"should":[]}},"sort":[{"_score":{"order":"desc"}},{"publishDate":{"order":"desc"}},{"_id":{"order":"desc"}}],"_source":{"includes":["title","articleId","siteId","cover","url","pathName","publishDate","attachmentUrl","districtName","gpCatalogName"],"excludes":["content"]}}
    payload = {"pageNo":1,"pageSize":"15","categoryCode":"ZcyAnnouncement3001","keyword":"大数据"}  # keyword: "big data"
    headers = {'content-type': 'application/json',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
               'Accept': '*/*',
               'Accept-Language': 'zh-CN,zh;q=0.9',
               'Connection': 'keep-alive',
               # Content-Length is deliberately not set here: requests computes it from the body.
               'Host': 'www.ccgp-xinjiang.gov.cn',
               'Origin': 'www.ccgp-xinjiang.gov.cn',
               'X-Requested-With': 'XMLHttpRequest',
               'Referer': 'http://www.ccgp-xinjiang.gov.cn/ZcyAnnouncement/ZcyAnnouncement2/ZcyAnnouncement3001/index.html'
               }
    r = requests.post(url, data=json.dumps(payload), headers=headers)
    print("urlopen OK")
    html = r.text
    #print(html)
    return html

def getinfourl(html):
    # Parse an Elasticsearch-style response (took / _shards / hits.hits),
    # with the article fields under each hit's _source.
    data = json.loads(html)
    print(data.keys())
    print(data['took'])
    print(data['_shards'])
    items = data['hits']['hits']
    book = xlwt.Workbook()                    # create a new Excel workbook
    sheet = book.add_sheet('bigdata_sheet')   # add a worksheet
    sheet.write(0, 0, '招标地区')   # tender district
    sheet.write(0, 1, '招标类型')   # tender type
    sheet.write(0, 2, '招标名称')   # tender title
    sheet.write(0, 3, '详情地址')   # detail URL
    sheet.write(0, 4, '发布时间')   # publish time
    row = 1  # current output row
    homeurl = 'http://www.ccgp-xinjiang.gov.cn'
    for item in items:
        print(item['_source']['title'])
        print(item['_source']['url'])
        sheet.write(row, 0, item['_source']['districtName'])
        sheet.write(row, 1, item['_source']['gpCatalogName'])
        sheet.write(row, 2, item['_source']['title'])
        sheet.write(row, 3, homeurl + item['_source']['url'])
        publishdate_ = item['_source']['publishDate']
        print('publishdate is %s' % publishdate_)
        print('now time %s' % time.time())
        # publishDate is a Unix timestamp in milliseconds
        _time = time.localtime(publishdate_ / 1000)
        publishdate = time.strftime("%Y-%m-%d %H:%M:%S", _time)
        sheet.write(row, 4, publishdate)
        row += 1
    filename = 'bigdatainfo_' + time.strftime("%Y%m%d%H%M%S", time.localtime()) + '.xls'
    book.save(filename)  # saved in the current working directory

html = getHtml(sys.argv[1])
result = getinfourl(html)
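The publishDate field comes back as a Unix timestamp in milliseconds, which is why the script divides it by 1000 before handing it to time.localtime. A standalone illustration of that conversion, with a made-up sample value:

import time

publish_ms = 1692345600000                       # example only: milliseconds since the epoch
local = time.localtime(publish_ms / 1000)        # convert to seconds, then to local time
print(time.strftime("%Y-%m-%d %H:%M:%S", local)) # output depends on your local timezone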

Usage: run the following from the command line

python getbigdatainfo.py http://www.ccgp-xinjiang.gov.cn/front/search/category

The endpoint URL passed as the argument changes from time to time: when I wrote this a month ago it was a different address, and it had already changed when I ran it again today, so I updated it.
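Since the endpoint can change, it is worth confirming that the response still has the Elasticsearch-style shape that getinfourl expects (took / _shards / hits) before parsing it in bulk. A small sanity check, assuming it is pasted at the bottom of the script (or run in the same session) so that getHtml is available:

# Quick check of the response shape before trusting the parser.
raw = getHtml('http://www.ccgp-xinjiang.gov.cn/front/search/category')
data = json.loads(raw)
print(sorted(data.keys()))   # expect keys such as 'took', '_shards', 'hits'
if 'hits' in data and 'hits' in data.get('hits', {}):
    print('Elasticsearch-style response; getinfourl should work as-is')
else:
    print('response shape has changed; inspect data and adjust getinfourl')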
