小试牛刀Python抓取某站最新迅雷账号

作者：Hui
日期: 2016-05-10
分类: Python
浏览次数： 501

小试牛刀，写了个 Python 脚本，用来抓取某站最新发布的迅雷账号，其中 bug 不少，欢迎大家指出~ 运行方式：在终端执行 python filename 即可（脚本基于 Python 2，需要先安装 BeautifulSoup 4）。下载链接 --> Thunder_spider

以下是代码:

#!/usr/bin/env python
#coding:utf8
#Date:2016-05-08 13:03:39

from bs4 import BeautifulSoup
import urllib2,re
# Category index page; it is fetched first so the link to the newest post can be read from it.
indexUrl = 'http://xlfans.com/archives/category/迅雷分享'
# Headers sent with every request made by getHtml(); the User-Agent presents the client as Baiduspider.
# NOTE(review): presumably chosen so the site serves the page to a crawler -- confirm.
header = {'User-Agent':'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'}

def getHtml(url):
    """Download *url* and return the raw response body.

    The request carries the module-level ``header`` dict and uses a
    10 second timeout.  A failed download is fatal for this script: the
    underlying error is written to stderr and the process exits with
    status 1, so callers never have to handle a missing page.

    :param url: absolute URL to fetch.
    :returns: the response body exactly as read from the connection.
    :raises SystemExit: if the page cannot be downloaded for any reason.
    """
    import sys

    request = urllib2.Request(url, headers=header)
    try:
        response = urllib2.urlopen(request, timeout=10)
        try:
            body = response.read()
        finally:
            # Release the connection even when the read itself fails.
            response.close()
    except Exception as e:
        # Deliberately broad: this is the script's fatal-error boundary for
        # downloads.  Report what actually went wrong (not every failure is
        # a timeout) and signal the failure through the exit status.
        sys.stderr.write('request failed for %r: %s\n' % (url, e))
        sys.exit(1)
    return body

# Locate the link to the newest account post on the category index page.
index_doc = getHtml(indexUrl)
indexSoup = BeautifulSoup(index_doc,'html.parser',from_encoding='utf-8')
# The first <a> inside the first <article> is taken to be the newest post.
newLinks = indexSoup.article.a if indexSoup.article is not None else None
if newLinks is None or not newLinks.get('href'):
    # Fail with a clear message instead of requesting an empty URL.
    raise SystemExit('no article link found on ' + indexUrl)
# Read the href attribute itself instead of regex-matching the serialized
# tag: the greedy pattern also captured any attributes that follow href.
# Encode to UTF-8 bytes, the same form the regex over str(tag) produced.
url = newLinks['href'].encode('utf-8')

#--------start-------------
# Download and parse the newest post; the extractor functions read `soup`.
html_doc = getHtml(url)
soup = BeautifulSoup(html_doc,'html.parser',from_encoding='utf-8')

# Shared result state: d1 maps account -> password; l1 collects the passwords
# of entries whose account is an obfuscated e-mail (`__cf_email__` anchors).
d1 = {}
l1 = []

# Decode an obfuscated e-mail address.
def decodeEmail(e):
    """Return the plain text hidden in the hex string *e*.

    The first two hex digits of *e* are the XOR key; every following pair
    of hex digits is one character XOR-ed with that key.  A trailing
    unpaired digit is ignored.

    :param e: hex-encoded string, e.g. the value of a ``data-cfemail``
        attribute.
    :returns: the decoded string.
    :raises ValueError: if *e* is empty or contains non-hex characters.
    """
    key = int(e[:2], 16)
    offsets = range(2, len(e) - 1, 2)
    return "".join(chr(int(e[pos:pos + 2], 16) ^ key) for pos in offsets)

# Paragraph texts that contained the account marker; filled by no_email().
no_email_list = []
def no_email():
    """Extract account/password pairs from the post's <p> elements.

    Every paragraph whose text contains the marker "迅雷粉迅雷会员账号" is
    remembered in the module-level ``no_email_list``; each remembered text
    is then split into the part before the password label (the account)
    and the part after it (the password) and stored in the module-level
    ``d1``.

    :returns: ``d1``, the shared account -> password dict.
    """
    for p_content in soup.find_all('p'):
        # Drop spaces and full-width colons before matching -- the same
        # normalisation has_email() applies -- so the bytes of a "：" separator
        # cannot end up inside the captured account or password.
        text = p_content.get_text().encode('utf-8').replace(' ','').replace('：','')
        p_match = re.findall(r'迅雷粉迅雷会员账号.*',text)
        if p_match:
            no_email_list.append(''.join(p_match))
    for list_item in no_email_list:
        account = ''.join(re.findall(r'迅雷粉迅雷会员账号(.*)密码?\W?.*',list_item))
        psword = ''.join(re.findall(r'迅雷粉迅雷会员账号.*密码?\W?(.*)',list_item))
        # A paragraph that mentions the marker but has no password label
        # yields no account; skip it rather than storing an empty-key entry
        # that main() would print and count.
        if account:
            d1[account] = psword
    return d1

def has_email():
    """Extract accounts from a post that contains obfuscated e-mail accounts.

    Reads the text of the element that encloses the article's <script>,
    with spaces and full-width colons removed, and handles two kinds of
    entries:

    * entries without the word "email": split at the first "密", then keep
      only ASCII word characters on each side (the text is UTF-8 bytes, so
      this also strips the Chinese labels) as account and password;
    * entries containing "email": only the password is readable in the
      text; it is appended to ``l1`` and later paired, by position, with
      the addresses decoded from the ``__cf_email__`` anchors.

    :returns: ``d1``, the shared account -> password dict.
    """
    firstResult = soup.article.script.parent.get_text().encode('utf-8').replace(' ','').replace('：','')
    for i in re.findall(r'账号\w+密码\w+|迅雷粉迅雷会员账号.*',firstResult):
        if not re.search(r'email',i):
            parts = re.split(r'密',i)
            if len(parts) < 2:
                # The second alternative of the pattern can match text with
                # no "密" at all; there is no password to read, so skip the
                # entry instead of failing with IndexError.
                continue
            account = re.sub(r'\W+','',parts[0])
            psword = re.sub(r'\W+','',parts[1])
            d1[account] = psword
        else:
            ps2 = re.findall(r'迅雷粉迅雷会员账号.*密码?(.*)',i)
            l1.append(''.join(ps2))
    allMail = [decodeEmail(mail['data-cfemail']) for mail in soup.find_all('a',class_='__cf_email__')]
    # zip() keeps only complete (address, password) pairs; padding the
    # shorter list with None would add a None account or a None password.
    d1.update(zip(allMail, l1))
    return d1

def main():
    for key in d1:
        print '迅雷帐号:',key,'密码:',d1[key]
    print '\033[01;32m==========Get {0} accounts!!===========\033[0m'.format(len(d1))

# Pick the extractor for this post, then print the results.
# A post whose <article> contains a <script> goes through has_email();
# everything else, including a page with no <article>, goes through no_email().
# NOTE(review): presumably the <script> is the e-mail de-obfuscation script -- confirm.
article = soup.article
if article is None or article.script is None:
    no_email()
else:
    has_email()
main()

#!/usr/bin/env python

#coding:utf8

#Date:2016-05-08 13:03:39

from bs4 import BeautifulSoup

import urllib2,re

# Category index page; it is fetched first so the link to the newest post can be read from it.
indexUrl = 'http://xlfans.com/archives/category/迅雷分享'

# Headers sent with every request made by getHtml(); the User-Agent presents the client as Baiduspider.
# NOTE(review): presumably chosen so the site serves the page to a crawler -- confirm.
header = {'User-Agent':'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'}

def getHtml(url):
    """Download *url* and return the raw response body.

    The request carries the module-level ``header`` dict and uses a
    10 second timeout.  A failed download is fatal for this script: the
    underlying error is written to stderr and the process exits with
    status 1, so callers never have to handle a missing page.

    :param url: absolute URL to fetch.
    :returns: the response body exactly as read from the connection.
    :raises SystemExit: if the page cannot be downloaded for any reason.
    """
    import sys

    request = urllib2.Request(url, headers=header)
    try:
        response = urllib2.urlopen(request, timeout=10)
        try:
            body = response.read()
        finally:
            # Release the connection even when the read itself fails.
            response.close()
    except Exception as e:
        # Deliberately broad: this is the script's fatal-error boundary for
        # downloads.  Report what actually went wrong (not every failure is
        # a timeout) and signal the failure through the exit status.
        sys.stderr.write('request failed for %r: %s\n' % (url, e))
        sys.exit(1)
    return body

# Locate the link to the newest account post on the category index page.
index_doc = getHtml(indexUrl)
indexSoup = BeautifulSoup(index_doc,'html.parser',from_encoding='utf-8')
# The first <a> inside the first <article> is taken to be the newest post.
newLinks = indexSoup.article.a if indexSoup.article is not None else None
if newLinks is None or not newLinks.get('href'):
    # Fail with a clear message instead of requesting an empty URL.
    raise SystemExit('no article link found on ' + indexUrl)
# Read the href attribute itself instead of regex-matching the serialized
# tag: the greedy pattern also captured any attributes that follow href.
# Encode to UTF-8 bytes, the same form the regex over str(tag) produced.
url = newLinks['href'].encode('utf-8')

#--------start-------------
# Download and parse the newest post; the extractor functions read `soup`.
html_doc = getHtml(url)
soup = BeautifulSoup(html_doc,'html.parser',from_encoding='utf-8')

# Shared result state: d1 maps account -> password; l1 collects the passwords
# of entries whose account is an obfuscated e-mail (`__cf_email__` anchors).
d1 = {}
l1 = []

# Decode an obfuscated e-mail address.
def decodeEmail(e):
    """Return the plain text hidden in the hex string *e*.

    The first two hex digits of *e* are the XOR key; every following pair
    of hex digits is one character XOR-ed with that key.  A trailing
    unpaired digit is ignored.

    :param e: hex-encoded string, e.g. the value of a ``data-cfemail``
        attribute.
    :returns: the decoded string.
    :raises ValueError: if *e* is empty or contains non-hex characters.
    """
    key = int(e[:2], 16)
    offsets = range(2, len(e) - 1, 2)
    return "".join(chr(int(e[pos:pos + 2], 16) ^ key) for pos in offsets)

# Paragraph texts that contained the account marker; filled by no_email().
no_email_list = []

def no_email():
    """Extract account/password pairs from the post's <p> elements.

    Every paragraph whose text contains the marker "迅雷粉迅雷会员账号" is
    remembered in the module-level ``no_email_list``; each remembered text
    is then split into the part before the password label (the account)
    and the part after it (the password) and stored in the module-level
    ``d1``.

    :returns: ``d1``, the shared account -> password dict.
    """
    for p_content in soup.find_all('p'):
        # Drop spaces and full-width colons before matching -- the same
        # normalisation has_email() applies -- so the bytes of a "：" separator
        # cannot end up inside the captured account or password.
        text = p_content.get_text().encode('utf-8').replace(' ','').replace('：','')
        p_match = re.findall(r'迅雷粉迅雷会员账号.*',text)
        if p_match:
            no_email_list.append(''.join(p_match))
    for list_item in no_email_list:
        account = ''.join(re.findall(r'迅雷粉迅雷会员账号(.*)密码?\W?.*',list_item))
        psword = ''.join(re.findall(r'迅雷粉迅雷会员账号.*密码?\W?(.*)',list_item))
        # A paragraph that mentions the marker but has no password label
        # yields no account; skip it rather than storing an empty-key entry
        # that main() would print and count.
        if account:
            d1[account] = psword
    return d1

def has_email():
    """Extract accounts from a post that contains obfuscated e-mail accounts.

    Reads the text of the element that encloses the article's <script>,
    with spaces and full-width colons removed, and handles two kinds of
    entries:

    * entries without the word "email": split at the first "密", then keep
      only ASCII word characters on each side (the text is UTF-8 bytes, so
      this also strips the Chinese labels) as account and password;
    * entries containing "email": only the password is readable in the
      text; it is appended to ``l1`` and later paired, by position, with
      the addresses decoded from the ``__cf_email__`` anchors.

    :returns: ``d1``, the shared account -> password dict.
    """
    firstResult = soup.article.script.parent.get_text().encode('utf-8').replace(' ','').replace('：','')
    for i in re.findall(r'账号\w+密码\w+|迅雷粉迅雷会员账号.*',firstResult):
        if not re.search(r'email',i):
            parts = re.split(r'密',i)
            if len(parts) < 2:
                # The second alternative of the pattern can match text with
                # no "密" at all; there is no password to read, so skip the
                # entry instead of failing with IndexError.
                continue
            account = re.sub(r'\W+','',parts[0])
            psword = re.sub(r'\W+','',parts[1])
            d1[account] = psword
        else:
            ps2 = re.findall(r'迅雷粉迅雷会员账号.*密码?(.*)',i)
            l1.append(''.join(ps2))
    allMail = [decodeEmail(mail['data-cfemail']) for mail in soup.find_all('a',class_='__cf_email__')]
    # zip() keeps only complete (address, password) pairs; padding the
    # shorter list with None would add a None account or a None password.
    d1.update(zip(allMail, l1))
    return d1

def main():

for key in d1:

print '迅雷帐号:',key,'密码:',d1[key]

print '\033[01;32m==========Get {0} accounts!!===========\033[0m'.format(len(d1))

# Pick the extractor for this post, then print the results.
# A post whose <article> contains a <script> goes through has_email();
# everything else, including a page with no <article>, goes through no_email().
# NOTE(review): presumably the <script> is the e-mail de-obfuscation script -- confirm.
article = soup.article
if article is None or article.script is None:
    no_email()
else:
    has_email()
main()

Tags: python

小试牛刀Python抓取某站最新迅雷账号

热门文章

近期文章

文章归档

小试牛刀Python抓取某站最新迅雷账号

热门文章

近期文章

文章归档

标签