小试牛刀Python抓取某站最新迅雷账号
小试牛刀,写了个Python脚本,用来抓取某站最新的迅雷账号,bug众多,欢迎大家指出~运行方式:在终端执行 python filename 即可(脚本基于Python 2,需先安装 beautifulsoup4),下载链接-->Thunder_spider
以下是代码:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
#!/usr/bin/env python #coding:utf8 #Date:2016-05-08 13:03:39 from bs4 import BeautifulSoup import urllib2,re indexUrl = 'http://xlfans.com/archives/category/迅雷分享' header = {'User-Agent':'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'} def getHtml(url): request = urllib2.Request(url,headers=header) try: response = urllib2.urlopen(request,timeout=10).read() except Exception,e: print 'time out' exit() else: return response #获取最新帐号链接 index_doc = getHtml(indexUrl) indexSoup = BeautifulSoup(index_doc,'html.parser',from_encoding='utf-8') newLinks = indexSoup.article.a url = ''.join(re.findall(r'href="(.*)">',str(newLinks))) #--------start------------- html_doc = getHtml(url) soup = BeautifulSoup(html_doc,'html.parser',from_encoding='utf-8') d1 = {} l1 = [] #解密邮箱函数 def decodeEmail(e): de = "" k = int(e[:2], 16) for i in range(2, len(e)-1, 2): de += chr(int(e[i:i+2], 16)^k) return de no_email_list = [] def no_email(): for p_content in soup.find_all('p'): p_match = re.findall(r'迅雷粉迅雷会员账号.*',str(p_content.get_text().encode('utf-8')).replace(' ','')) if p_match:no_email_list.append(''.join(p_match)) for list_item in no_email_list: account = re.findall(r'迅雷粉迅雷会员账号(.*)密码?\W?.*',list_item) psword = re.findall(r'迅雷粉迅雷会员账号.*密码?\W?(.*)',list_item) d1[''.join(account)]=''.join(psword) return d1 def has_email(): firstResult = soup.article.script.parent.get_text().encode('utf-8').replace(' ','').replace(':','') #print firstResult for i in re.findall(r'账号\w+密码\w+|迅雷粉迅雷会员账号.*',firstResult): if not re.search(r'email',i): acSplit = re.split(r'密',i)[0] account = list(re.sub(r'\W+','',''.join(acSplit))) pSplit = re.split(r'密',i)[1] psword = list(re.sub(r'\W+','',''.join(pSplit))) #print account,psword d1[''.join(account)]=''.join(psword) else: ps2 = re.findall(r'迅雷粉迅雷会员账号.*密码?(.*)',i) l1.append(''.join(ps2)) #print l1 allMail = [decodeEmail(mail['data-cfemail']) for mail in soup.find_all('a',class_='__cf_email__')] d2 = dict(map(None,allMail,l1)) #print 
d2 d1.update(d2) return d1 def main(): for key in d1: print '迅雷帐号:',key,'密码:',d1[key] print '\033[01;32m==========Get {0} accounts!!===========\033[0m'.format(len(d1)) if soup.article.script == None: no_email() main() else: has_email() main() |