An annoying little Fanfou search scraper. Fanfou tricked me at first: there is a p parameter in the search URL that caps the maximum number of search results.
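Concretely, the script below ignores the next-page link's own query string and pages by passing the last seen status id via the m parameter instead. These are the URL forms it uses; the exact server-side meaning of p beyond "caps the result count" is my reading of the behaviour:

    /search?q=<keyword>&noframe=yes          first page
    /search?q=<keyword>&noframe=yes&m=<mid>  next page, everything after status <mid>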
The scraped results are saved as a Python module (Python 3.0); a sketch for reading them back follows the script.
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import urllib.request
import urllib.parse   # quote() lives here; relying on urllib.request to pull it in is fragile
import re
import codecs

key_word = '番茄操蛋'
key_enc = urllib.parse.quote(key_word)

# Skip these three lines if you are not behind a proxy
proxy_support = urllib.request.ProxyHandler({"http" : "http://192.168.60.250:8080"})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)

def load(url):
    f = urllib.request.urlopen('http://fanfou.com' + url)
    resp = f.read().decode("utf-8")
    f.close()
    return resp

# One status entry: uid, nick, avatar, content, message id and timestamp
user_re = re.compile(r'<a href="/(?P<uid>[^"]+?)" title="(?P<nick>[^"]+?)" class="avatar"><img src="(?P<avatar>[^"]+?)".+?<span class="content">(?P<content>.+?)</span>.+?<a href="/statuses/(?P<mid>[^"]+?)" class="time" title="(?P<time>[^"]+?)">')
next_re = re.compile(r'<a href="(?P<url>[^"]+?)">下一页</a>')

collected_data = []

f = codecs.open('output_' + key_word + '.py', 'w', 'utf-8')
f.write('''#!/usr/bin/env python
# -*- coding: UTF-8 -*-
[
''')

def parse_page(text):
    mid = None   # stays None on a page with no matches
    for m in user_re.finditer(text):
        data = m.groupdict()
        f.write(str(data))
        f.write(',\n')
        collected_data.append(data)
        print(m.group('mid'), '@', m.group('time'))
        mid = m.group('mid')
    print('---')
    # Page with m=<last status id> instead of the p parameter from the
    # "next page" link, since p caps the number of results
    m = next_re.search(text)
    if m and mid:
        return '/search?q=' + key_enc + '&noframe=yes&m=' + mid
    else:
        return None

url = '/search?q=' + key_enc + '&noframe=yes'
while url:
    text = load(url)
    url = parse_page(text)
    print(url)

f.write(']\n')
f.close()
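Since the output file is nothing more than a list literal behind two comment header lines, one way to load it back is ast.literal_eval rather than an import. A minimal sketch, assuming the output_番茄操蛋.py file produced by the run above:

# -*- coding: UTF-8 -*-
import ast
import codecs

f = codecs.open('output_番茄操蛋.py', 'r', 'utf-8')
text = f.read()
f.close()

# Skip the shebang/coding header lines and evaluate the list literal itself
statuses = ast.literal_eval(text[text.index('['):])
print(len(statuses), 'statuses loaded')
if statuses:
    print(statuses[0]['nick'], ':', statuses[0]['content'])

literal_eval only accepts Python literals, so it is safer here than importing or exec'ing the file; the trailing comma the scraper leaves before the closing bracket is legal in a list literal, so no cleanup is needed.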