[Bored] Digging Up Fanfou's Old Posts

A bored-afternoon Fanfou search scraper. Fanfou played me at first: the search URL has a p= parameter that caps the maximum number of results it will return, which is why the script below pages with the m= (last status id) parameter instead.
The scraped results are saved as a Python module (Python 3.0).
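
A minimal sketch of the two paging styles (the exact p= form is my assumption from poking at the search page; key_enc and mid are defined in the full script below):

base = '/search?q=' + key_enc + '&noframe=yes'
capped = base + '&p=2'        # p= paging: Fanfou caps how many results this reaches
endless = base + '&m=' + mid  # m= paging: continue from the last status id seen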

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

import re
import urllib.parse
import urllib.request

key_word = '番茄操蛋'
key_enc = urllib.parse.quote(key_word)

# Ignore these three lines if you don't need an HTTP proxy.
proxy_support = urllib.request.ProxyHandler({"http": "http://192.168.60.250:8080"})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)

def load(url):
    """Fetch a fanfou.com path and return the page body as a UTF-8 string."""
    f = urllib.request.urlopen('http://fanfou.com' + url)
    resp = f.read().decode("utf-8")
    f.close()
    return resp

user_re = re.compile(r'<a href="/(?P<uid>[^"]+?)" title="(?P<nick>[^"]+?)" class="avatar"><img src="(?P<avatar>[^"]+?)".+?<span class="content">(?P<content>.+?)</span>.+?<a href="/statuses/(?P<mid>[^"]+?)" class="time" title="(?P<time>[^"]+?)">')
next_re = re.compile(r'<a href="(?P<url>[^"]+?)">下一页</a>')
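
# The status regex assumes each search result looks roughly like this
# (Fanfou's markup at the time of writing; a layout change breaks both
# regexes silently):
#   <a href="/UID" title="NICK" class="avatar"><img src="AVATAR" ...>
#     ... <span class="content">CONTENT</span> ...
#     <a href="/statuses/MID" class="time" title="TIME">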

collected_data = []

# Save results as an importable Python module: a list of status dicts
# bound to the top-level name `results`.
f = open('output_' + key_word + '.py', 'w', encoding='utf-8')
f.write('# -*- coding: UTF-8 -*-\nresults = [\n')
def parse_page(text):
    """Write out every status on the page; return the next page's URL, or None."""
    mid = None  # id of the last status seen on this page
    for m in user_re.finditer(text):
        data = m.groupdict()
        f.write(str(data))  # a dict literal is valid Python source
        f.write(',\n')
        collected_data.append(data)
        mid = m.group('mid')
        print(mid, '@', m.group('time'))
        print('---')

    # Page onward from the last status id (m=) instead of the capped p= parameter.
    if next_re.search(text) and mid:
        return '/search?q=' + key_enc + '&noframe=yes&m=' + mid
    return None

url = '/search?q=' + key_enc + '&noframe=yes'
while url:
    text = load(url)
    url = parse_page(text)
    print(url)

f.write(']\n')
f.close()
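
Reading the data back afterwards: a minimal sketch, assuming the results = [...] binding written above and a Python 3 new enough for importlib.util (the module name 'fanfou_dump' is arbitrary; the file name depends on key_word):

import importlib.util

spec = importlib.util.spec_from_file_location('fanfou_dump', 'output_番茄操蛋.py')
dump = importlib.util.module_from_spec(spec)
spec.loader.exec_module(dump)

for status in dump.results:
    print(status['time'], status['nick'], status['content'])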