import urlparse
from os import sep, unlink, makedirs, rmdir
from os.path import splitext, dirname, isdir, exists
import urllib
import urllib2
from htmllib import HTMLParser
from formatter import AbstractFormatter, DumbWriter
from cStringIO import StringIO
from string import replace, find, lower, index
from sys import argv
import shutil
class Retrieve(object):
    """Download a single URL into a local file and extract its links.

    The local file name mirrors the URL as ``host/path`` relative to the
    current working directory. Constructing an instance already creates the
    directories needed for that file.
    """

    def __init__(self, url):
        """Store *url* and compute (and prepare on disk) its local file name."""
        self.url = url
        self.fileName = self.getFileName(url)
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1)'

    def getFileName(self, url, defaultName='index.html'):
        """Return the local path for *url*, creating its directory if needed.

        url         -- URL to map; 'http:' is passed to urlparse as the
                       default scheme.
        defaultName -- file name appended when the URL path has no extension.

        Side effect: missing directories are created, and plain files that
        occupy the place of a needed directory are deleted.
        """
        parseurl = urlparse.urlparse(url, 'http:', False)
        urlPath = parseurl[2]
        # Keep the result relative to the working directory even when the
        # URL has no host part and its path starts with '/'.
        # NOTE(review): '..' segments in the URL path are not normalised here.
        path = (parseurl[1] + urlPath).lstrip('/')
        # Look for an extension in the URL path only: on host + path a bare
        # host such as 'www.example.com' would be read as having the
        # extension '.com' and would get no default file name.
        if splitext(urlPath)[1] == '':
            if path == '' or path.endswith('/'):
                path += defaultName
            else:
                path += '/' + defaultName
        ldir = dirname(path)
        # An empty dirname means the file lives in the working directory.
        if ldir and not isdir(ldir):
            self._makeDirs(ldir)
        return path

    def _makeDirs(self, ldir):
        """Create *ldir* level by level, deleting plain files in the way."""
        current = ''
        for part in ldir.split('/'):
            current += part
            if current and not isdir(current):
                if exists(current):
                    # A previously saved page sits where a directory is needed.
                    unlink(current)
                makedirs(current)
            current += '/'

    def download(self):
        """Fetch self.url and save the body to self.fileName.

        Returns the list of lines read from the response, or the string
        '***' when an IOError occurs while fetching or saving.
        """
        try:
            headers = {'User-Agent': self.user_agent}
            req = urllib2.Request(self.url, headers=headers)
            response = urllib2.urlopen(req)
            try:
                retval = response.readlines()
            finally:
                response.close()
            # Binary mode: the body is written exactly as received.
            with open(self.fileName, 'wb') as f:
                f.writelines(retval)
        except IOError:
            retval = '***'
        return retval

    def parseAndGetLinks(self):
        """Parse the saved file and return the anchor targets found in it.

        The parser is kept on self.htmlParse; its anchorlist is returned.
        """
        with open(self.fileName) as f:
            html = f.read()
        self.htmlParse = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.htmlParse.feed(html)
        self.htmlParse.close()
        return self.htmlParse.anchorlist
class Crawler(object):
    """Crawl the pages of one site, starting from a single URL.

    urlQueue holds URLs still to fetch (used as a stack by go()),
    urlSeenQueue the URLs that were fetched successfully.
    """

    def __init__(self, url, requiredText='li2818'):
        """Set up the crawl and wipe any previous local copy of the site.

        url          -- start URL; its network location becomes self.domain.
        requiredText -- only links containing this text are followed; pass
                        None (or '') to follow every link on the domain.

        Side effect: a directory named like the domain in the current
        working directory is removed.
        """
        self.url = url
        self.urlQueue = [url]
        self.urlSeenQueue = []
        self.requiredText = requiredText
        self.domain = urlparse.urlparse(url)[1]
        if isdir(self.domain):
            shutil.rmtree(self.domain)

    def getPage(self, url):
        """Download *url* and queue the links found in it.

        Nothing is recorded when the download fails.
        """
        r = Retrieve(url)
        retVal = r.download()
        # download() signals failure with the string '***'. Compare the whole
        # value: indexing it would raise IndexError on an empty page.
        if retVal == '***':
            return
        self._recordLinks(url, r.parseAndGetLinks())

    def _recordLinks(self, url, links):
        """Mark *url* as seen and queue every new, acceptable link."""
        # Mark the page first so that a link to itself is not queued again.
        if url not in self.urlSeenQueue:
            self.urlSeenQueue.append(url)
        for link in links:
            if link[:4] != 'http' and '://' not in link:
                link = urlparse.urljoin(url, link)
            if not self._acceptLink(link):
                continue
            if link in self.urlSeenQueue or link in self.urlQueue:
                continue
            self.urlQueue.append(link)

    def _acceptLink(self, link):
        """Return True when *link* passes all the crawl filters."""
        if 'mailto:' in link.lower():
            return False
        # Test the host part only, so that a foreign URL that merely mentions
        # the domain in its path or query string is not followed.
        if self.domain not in urlparse.urlparse(link)[1]:
            return False
        if '#comments' in link:
            return False
        if self.requiredText and self.requiredText not in link:
            return False
        return True

    def testUseful(self, url):
        """Return True when *url* can be opened and answers with HTTP 200."""
        try:
            fUrl = urllib.urlopen(url)
        except IOError:
            return False
        try:
            return fUrl.getcode() == 200
        finally:
            fUrl.close()

    def go(self):
        """Fetch queued URLs, most recently added first, until none are left."""
        while self.urlQueue:
            url = self.urlQueue.pop()
            print('seen url' + url)
            self.getPage(url)

    def printSeen(self):
        """Write the seen URLs, last one first, to the file 'already_seen_url'.

        urlSeenQueue is emptied in the process.
        """
        with open('already_seen_url', 'w') as f:
            while self.urlSeenQueue:
                f.write(self.urlSeenQueue.pop() + '\n')
def main():
    """Crawl the fixed start page, then write out the list of seen URLs."""
    # The start URL is fixed here; other start pages tried before were
    # 'http://www.hao123.com' and 'http://blog.csdn.net'.
    startUrl = 'http://blog.csdn.net/li2818'
    crawler = Crawler(startUrl)
    crawler.go()
    crawler.printSeen()
    print('done!')
# Run the crawl only when this file is executed as a script.
if "__main__" == __name__:
    main()
一个简单的爬虫程序，包含请求头。
最新推荐文章于 2025-02-13 16:06:54 发布
本文介绍了一个简单的Python网页爬虫程序,该程序可以下载指定URL的内容并解析页面中的链接。此外,它还具备基本的错误处理功能,并能递归地抓取同一域名下的其他网页。

729

被折叠的 条评论
为什么被折叠?



