HTML/JS转换工具
Unicode编码转换
当前位置:首页 > 编程开发 > Python > 

pycurl模块抓取网页实例代码

来源:IT技术网编辑:雨天发布于:2017-01-22人围观

 

初次接触 pycurl 的人往往很难找到合适的入门资料,于是写了几个简单的 demo,方便想上手的读者参考:
import pycurl
import StringIO

# Minimal pycurl demo: fetch one page into an in-memory buffer and print it.
# NOTE: this sample targets Python 2 (the StringIO module); on Python 3 the
# pycurl documentation recommends io.BytesIO as the write target instead.
url = 'www.baidu.com'
b = StringIO.StringIO()
c = pycurl.Curl()
try:
    c.setopt(c.URL, url)
    # Every chunk of the response is handed to b.write.
    c.setopt(c.WRITEFUNCTION, b.write)
    c.setopt(c.FOLLOWLOCATION, 1)
    # HEADER=True makes the response headers part of the captured output.
    c.setopt(c.HEADER, True)
    c.perform()
    html = b.getvalue()
    print(html)
finally:
    # Release the buffer and the curl handle even if the transfer fails.
    b.close()
    c.close()
=========================================================
def test(debug_type, debug_msg):
    """Print one debug message in the form ``debug(<type>): <message>``.

    The signature matches what is registered with ``c.DEBUGFUNCTION``.

    debug_type -- integer code, formatted with ``%d``
    debug_msg  -- message text, formatted with ``%s``
    """
    print("debug(%d): %s" % (debug_type, debug_msg))
pycurl 中常用的一些 setopt 选项:
# Frequently used options, applied to an existing pycurl.Curl handle `c`.
# `set_cookie`, `url`, `params` and `test` must already be defined.
request_headers = [
    "Content-Type: application/x-www-form-urlencoded",
    "X-Requested-With:XMLHttpRequest",
    "Cookie:" + set_cookie[0],
]
c.setopt(c.HTTPHEADER, request_headers)  # extra request headers
c.setopt(c.REFERER, url)
c.setopt(c.POSTFIELDS, params)           # data sent as the request body
c.setopt(c.VERBOSE, 1)
c.setopt(c.POST, 1)
c.setopt(c.DEBUGFUNCTION, test)          # callback taking (type, message)
# Download a page with pycurl, writing the body and the response headers to
# two separate files, then print transfer statistics from getinfo().
# NOTE(review): `progress` (the PROGRESSFUNCTION callback) is not defined in
# this snippet; it has to be defined before this code runs.
url = "http://www.cnn.com"
print("Starting downloading %s" % url)
print("")
# The with-block closes both output files even if the transfer raises.
with open("body", "wb") as f, open("header", "wb") as h:
    c = pycurl.Curl()
    try:
        c.setopt(c.URL, url)
        c.setopt(c.WRITEDATA, f)
        # NOPROGRESS=0 turns progress reporting on so the callback is used.
        c.setopt(c.NOPROGRESS, 0)
        c.setopt(c.PROGRESSFUNCTION, progress)
        c.setopt(c.FOLLOWLOCATION, 1)
        c.setopt(c.MAXREDIRS, 5)
        c.setopt(c.WRITEHEADER, h)
        # NOTE(review): POST is enabled but no POSTFIELDS is set in this
        # snippet -- confirm this is intended for a plain download.
        c.setopt(c.POST, 1)
        c.setopt(c.OPT_FILETIME, 1)
        c.perform()
        print("HTTP-code: %s" % c.getinfo(c.HTTP_CODE))
        print("Total-time: %s" % c.getinfo(c.TOTAL_TIME))
        print("Download speed: %.2f bytes/second" % c.getinfo(c.SPEED_DOWNLOAD))
        print("Document size: %d bytes" % c.getinfo(c.SIZE_DOWNLOAD))
        print("Effective URL: %s" % c.getinfo(c.EFFECTIVE_URL))
        print("Content-type: %s" % c.getinfo(c.CONTENT_TYPE))
        print("Namelookup-time: %s" % c.getinfo(c.NAMELOOKUP_TIME))
        print("Redirect-time: %s" % c.getinfo(c.REDIRECT_TIME))
        print("Redirect-count: %s" % c.getinfo(c.REDIRECT_COUNT))
        epoch = c.getinfo(c.INFO_FILETIME)
        # print("Filetime: %d (%s)" % (epoch, time.ctime(epoch)))
        # print("")
        print("Header is in file 'header', body is in file 'body'")
    finally:
        # Always release the curl handle, even when perform() fails.
        c.close()
# print(pycurl.version_info())
#print pycurl.version_info()
# Fetch a page into memory with a custom header list; the commented-out
# lines show further options that can be enabled as needed.
# NOTE(review): the URL literal was lost in the published text -- fill in the
# page to fetch before running.
url = ''
b = StringIO.StringIO()
c = pycurl.Curl()
try:
    c.setopt(pycurl.URL, url)
    # "Accept:" with nothing after the colon: per the libcurl documentation
    # this suppresses the Accept header libcurl would otherwise send.
    c.setopt(pycurl.HTTPHEADER, ["Accept:"])
    c.setopt(pycurl.WRITEFUNCTION, b.write)
    # 1 is the documented "enable" value and matches the other examples.
    c.setopt(pycurl.FOLLOWLOCATION, 1)
    # c.setopt(pycurl.HEADER, True)
    c.setopt(pycurl.MAXREDIRS, 5)
    # c.setopt(pycurl.USERAGENT, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)")
    # c.setopt(pycurl.REFERER, "")
    # c.setopt(pycurl.CONNECTTIMEOUT, 20)  # connection timeout
    # c.setopt(pycurl.TIMEOUT, 20)  # download timeout
    # c.setopt(pycurl.COOKIEFILE, "cookie_file_name")
    # c.setopt(pycurl.COOKIEJAR, "cookie_file_name")
    c.perform()
    html = b.getvalue()
    print('-----------')
    print(html)
finally:
    # Release the curl handle even if the transfer fails.
    c.close()
========================代理使用

def getURLContent_pycurl(url):
    """Fetch ``url`` with pycurl and return the collected response body.

    Redirects are followed, up to a maximum of 5. The response is written
    into a StringIO buffer and returned via ``getvalue()``.

    url -- address of the page to fetch
    """
    c = pycurl.Curl()
    b = StringIO.StringIO()
    try:
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.WRITEFUNCTION, b.write)
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 5)
        # Proxy settings -- uncomment and adjust to send the request
        # through a proxy:
        # c.setopt(pycurl.PROXY, 'http://11.11.11.11:8080')
        # c.setopt(pycurl.PROXYUSERPWD, 'aaa:aaa')
        c.perform()
        return b.getvalue()
    finally:
        # Release the curl handle whether or not the transfer succeeded.
        c.close()

# Example: fetch a page with getURLContent_pycurl and print what it returns.
url = 'http://blog.csdn.net'
content = getURLContent_pycurl(url)
print(content)


织梦二维码生成器

与相关的文章
有时间的话来看看IT界的突发事件
发表评论
请自觉遵守互联网相关的政策法规,严禁发布色情、暴力、反动的言论。
评价:
表情:
用户名: 验证码:点击我更换图片