(爬虫获取php代码)(学爬虫用不用php)

#coding=utf8

import urllib2

import codecs

import re

import time

from lxml import etree

# Page fetched first; path1 is evaluated against it.
url1 = 'http://521xunlei.com/portal.php'
# XPath: href of the anchor in the first <li> under the element whose id is
# "portal_block_62_content".
path1 = '//*[@id="portal_block_62_content"]/div/ul/li[1]/a/@href'
# XPath: text nodes of <font> children of elements whose class attribute is
# exactly "t_f".
path3 = '//*[@class="t_f"]/font/text()'

def geturlinfo(url, path, x):
    """Download ``url``, evaluate the XPath ``path`` on it and act on the result.

    Args:
        url: Address of the page to download.
        path: XPath expression evaluated against the parsed HTML.
        x: Mode flag. ``'1'`` returns the first XPath result. Any other
            value prints every result containing ``':'`` (numbered from 0,
            with ``'\\r\\n'`` removed) and writes the same lines, UTF-8
            encoded, to ``thunder.txt`` in the current directory.

    Returns:
        The first XPath result when ``x == '1'``; otherwise ``None``.

    Raises:
        IndexError: ``x == '1'`` and the XPath matched nothing.
    """
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        result = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()

    nodes = etree.HTML(result).xpath(path)

    if x == '1':
        if not nodes:
            # Same exception type as indexing an empty list, but with a
            # message that says which lookup failed.
            raise IndexError('XPath %r matched nothing at %s' % (path, url))
        return nodes[0]

    # Opening once in 'w' mode truncates the previous contents and keeps a
    # single handle for all lines; the with-block closes it deterministically.
    with open('thunder.txt', 'w') as out:
        i = 0
        for node in nodes:
            if ':' not in node:
                continue
            line = str(i) + ': ' + node.replace('\r\n', '')
            # Single-argument print(...) prints the same text as the
            # Python 2 print statement.
            print(line)
            out.write(line.encode('utf8') + '\n')
            i += 1

if __name__ == '__main__':
    # Once every 24 hours: read a link from the portal page (url1/path1),
    # prefix it with the portal's host, then fetch that page and dump the
    # lines selected by path3 via geturlinfo.
    while True:
        print('===================start===================\n')
        url2 = 'http://%s/%s' % (
            url1.replace('http://', '').split('/')[0],  # host part of url1
            geturlinfo(url1, path1, '1'),
        )
        print('GET From: ' + url2)
        geturlinfo(url2, path3, '0')
        time.sleep(24 * 3600)

# XPath tip: starts-with(@id, "test") matches elements whose id begins with "test".

# XPath tip: first select the target div, then evaluate string(.) on it to get its text combined into one string.

声明：我要去上班所有作品（图文、音视频）均由用户自行上传分享，仅供网友学习交流，版权归原作者Python乐园所有，原文出处。若您的权利被侵害，请联系删除。

本文标题：(爬虫获取php代码)(学爬虫用不用php)
本文链接：https://www.51qsb.cn/article/m9dlj.html