日本国产在线视频,精品一区二区三区免费观看,成人免费视频在线

本文實例講述了 Python 提取頁面內 URL 列表的方法。分享給大家供大家參考。具體實現方法如下：

									from bs4 import BeautifulSoup

									import time,re,urllib2

									# Timestamp taken when the script starts.
									# NOTE(review): nothing in the visible code reads this global;
									# scanpage() keeps its own local start time instead.
									t=time.time()

									# Module-level dict that scanpage() checks each href against; hrefs
									# that are already keys here are skipped.
									# NOTE(review): no visible code ever adds entries, so it stays empty.
									websiteurls={}

									def scanpage(url):
									  """Fetch ``url`` and probe each distinct link on it that contains ``url``.

									  The page is downloaded and parsed; every ``<a href=...>`` whose href
									  contains ``url`` as a substring, and is not already a key of the
									  module-level ``websiteurls`` dict, is collected once.  Each collected
									  link is then requested a single time.  For a link that answers, its
									  index, address and HTTP status code are printed on one line, followed
									  by the request duration in seconds on the next.  For a link that cannot
									  be opened, ``connect failed`` is printed.  Finally the number of links
									  probed and the total elapsed time in seconds are printed.

									  Args:
									    url: Address of the page to scan; also used as the substring filter
									      that decides which hrefs are probed.

									  Returns:
									    None.  Results are reported on standard output only.

									  Raises:
									    Whatever ``urllib2.urlopen`` raises when the start page itself cannot
									    be fetched.  Failures on individual links are reported, not raised.
									  """
									  started = time.time()

									  # Close the response explicitly so the socket is not left to the GC.
									  page = urllib2.urlopen(url)
									  try:
									    html = page.read()
									  finally:
									    page.close()

									  # Name the parser so the result does not depend on which optional
									  # parser libraries happen to be installed.
									  soup = BeautifulSoup(html, "html.parser")

									  # Dict keys de-duplicate the hrefs; the value becomes the status code.
									  # NOTE(review): this is a substring test, so an href that merely embeds
									  # ``url`` (e.g. in a query string) is probed too, and relative links
									  # are never matched -- confirm whether a prefix test was intended.
									  link_status = {}
									  for anchor in soup.find_all("a", href=True):
									    href = anchor.get("href")
									    if url in href and href not in websiteurls:
									      link_status[href] = 0

									  for n, link in enumerate(link_status):
									    request_started = time.time()
									    try:
									      # One request per link: the status is read from the same response
									      # that proved the link reachable, so the timing below covers the
									      # real request and no second, unguarded fetch can abort the scan.
									      response = urllib2.urlopen(link)
									      try:
									        status = response.getcode()
									      finally:
									        response.close()
									    except Exception:
									      # Best-effort probe: any fetch error is reported and the scan goes
									      # on.  ``Exception`` (not a bare except) keeps Ctrl-C working.
									      print("connect failed")
									    else:
									      link_status[link] = status
									      print("%s %s %s" % (n, link, status))
									      print(time.time() - request_started)

									  print("total is " + repr(len(link_status)) + " links")
									  print(time.time() - started)

									# Script entry point: runs the scan against http://news.163.com/ as soon
									# as this file is executed.
									scanpage("http://news.163.com/")