运行环境
开始之前你需要安装
•PHP 5.3.7 +
•MySQL
•Python 2.7 ~
•xunsearch 搜索引擎
获取源码
ssh 方式:
git clone git@github.com:k1995/BaiduyunSpider.git
https 方式:
git clone https://github.com/k1995/BaiduyunSpider
或手动下载
https://github.com/k1995/BaiduyunSpider/archive/master.zip
下载完毕后,项目的目录结构大致是这样的
— indexer/ #索引
— spider/ #爬虫
— sql/
— web/ #网站
— application/
— config/ # 配置相关
— config.php
— database.php # 数据库配置
…
…
— static/ # 存放静态资源,css|js|font
— system/
— index.php
…
开始部署
创建数据库
创建名为pan的数据库,编码设为utf-8。然后导入sql,完成表的创建。
网站部署
支持nginx,apache 服务器。
apache 需要开启 mod_rewrite 。
nginx 配置如下
location /
{
index index.php;
try_files $uri $uri/ /index.php/$uri;
}
location ~ [^/]\.php(/|$)
{
fastcgi_pass 127.0.0.1:9000;
fastcgi_index index.php;
include fastcgi.conf;
include pathinfo.conf;
}
配置文件修改
config.php 文件修改网站标题,描述等信息
database.php 修改数据库账号,密码等信息
网站是基于CodeIgniter 框架开发的,如安装,部署,或二次开发有问题,请参考官网文档
启动爬虫
进入 spider/目录,修改spider.py 中数据库信息。
如果你是第一次部署,需运行下面命令,完成做种
python spider.py --seed-user
上面其实就是抓取百度云热门分享用户的相关信息,然后从他们开始入手爬取数据
然后运行
python spider.py
此时爬虫已经开始工作了
安装xunsearch
目前使用xunsearch作为搜索引擎,后面会更换为elasticsearch。
安装过程请参考(不需要安装,PHP SDK,我已经整合到web里了)
http://xunsearch.com/doc/php/guide/start.installation
索引数据
上面我们完成了爬虫的数据抓取,网站的搭建,但还不能搜索,下面开始最后一步,索引的建立。
进入 indexer/目录,在indexer.php中将$prefix,替换为你web的根路径
require '$prefix/application/helpers/xs/lib/XS.php';
并修改数据库账号密码
然后运行
./indexer.php
import urllib2
import re
import argparse
import json
import time
import MySQLdb as mdb
import metautils
import traceback
import Queue
import socket
import random

# --- database connection settings -------------------------------------
DB_HOST = '127.0.0.1'
# NOTE(review): DB_PORT is a string and is never passed to mdb.connect()
# below -- the default MySQL port is always used; confirm before relying on it.
DB_PORT = '3306'
DB_USER = 'root'
DB_PASS = '123123'
DB_NAME = 'pan'

# Seconds to sleep between work items in the main spider loop.
SPIDER_INTERVAL = 1

# Error codes stored in BaiduPanSpider.errno.
ERR_NO = 0       # no error
ERR_REFUSE = 1   # remote API refused the request (its errno == -55)
ERR_EX = 2       # any other remote error

# HTTP proxies; getHtml() picks one at random per request.
# Replace the x.x.x.x placeholders with real proxy host:port values.
proxy_list = [
    {'http': "x.x.x.x:8080"},
    {'http': "x.x.x.x:8081"},
    {'http': "x.x.x.x:8082"},
    {'http': "x.x.x.x:8083"},
    {'http': "x.x.x.x:8084"},
    {'http': "x.x.x.x:8085"},
    {'http': "x.x.x.x:8086"},
    {'http': "x.x.x.x:8087"},
    {'http': "x.x.x.x:8088"},
    {'http': "x.x.x.x:8089"}
]
- def getHtml(url,ref=None,reget=5):
- try:
- uas = [
- “Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4”,
- “Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)”,
- “Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko”,
- “Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0”,
- “Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36”,
- “Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko”,
- ]
- proxy_ip =random.choice(proxy_list)
- ua=random.choice(uas)
- print proxy_ip
- print ua
- proxy_support = urllib2.ProxyHandler(proxy_ip)
- opener = urllib2.build_opener(proxy_support,urllib2.HTTPHandler)
- urllib2.install_opener(opener)
- request = urllib2.Request(url)
- time.sleep(5)
- request.add_header(‘User-Agent’, ua)
- if ref:
- request.add_header(‘Referer’,ref)
- page = urllib2.urlopen(request,timeout=30)
- html = page.read()
- except:
- if reget>=1:
-
- print ‘getHtml error,reget…%d’%(6-reget)
- time.sleep(20)
- return getHtml(url,ref,reget-1)
- else:
- print ‘request url:’+url
- print ‘failed to fetch html’
- exit()
- else:
- return html
- class Db(object):
- def __init__(self):
- self.dbconn=None
- self.dbcurr=None
- def check_conn(self):
- try:
- self.dbconn.ping()
- except:
- return False
- else:
- return True
- def conn(self):
- self.dbconn=mdb.connect(DB_HOST, DB_USER, DB_PASS,DB_NAME, charset=‘utf8’)
- self.dbconn.autocommit(False)
- self.dbcurr = self.dbconn.cursor()
- def fetchone(self):
- return self.dbcurr.fetchone()
- def fetchall(self):
- return self.dbcurr.fetchall()
- def execute(self, sql, args=None,falg=False):
- if not self.dbconn:
-
- self.conn()
- try:
- if args:
- rs=self.dbcurr.execute(sql,args)
- else:
- rs=self.dbcurr.execute(sql)
- return rs
- except Exception, e:
- if self.check_conn():
- print ‘execute error’
- traceback.print_exc()
- else:
- print ‘reconnect mysql’
- self.conn()
- if args:
- rs=self.dbcurr.execute(sql,args)
- else:
- rs=self.dbcurr.execute(sql)
- return rs
- def commit(self):
- self.dbconn.commit()
- def rollback(self):
- self.dbconn.rollback()
- def close(self):
- self.dbconn.close()
- self.dbcurr.close()
- def last_row_id(self):
- return self.dbcurr.lastrowid
- class BaiduPanSpider(object):
- def __init__(self):
- self.db=Db()
- self.files=[]
- self.got_files_count=0
- self.got_follow_count=0
- self.while_count=0
- self.spider_queue=Queue.Queue(maxsize=20)
- self.status=‘stop’
- self.errno=ERR_NO
- self.file_type_t={‘video’:0,‘image’:1,‘document’:2,‘music’:3,‘package’:4,‘software’:5,‘torrent’:6,‘other’:-1}
- def getShareUser(self,uk):
- url=‘http://pan.baidu.com/share/count?uk=%d&channel=chunlei&clienttype=0&web=1’%uk
- follows_json=json.loads(getHtml(url,uk))
- if follows_json[‘errno’]!=0:
- if follows_json[‘errno’]==-55:
- self.errno=ERR_REFUSE
- else:
- self.errno=ERR_EX
- return False
- return {
- ‘pubshare_cnt’:follows_json[‘pubshare_cnt’],
- ‘fans’:follows_json[‘fans’],
- ‘follow’:follows_json[‘follow’],
- ‘album’:follows_json[‘follows_json’]
- }
- def getHotUser(self):
- url=‘http://pan.baidu.com/pcloud/friend/gethotuserlist?type=1&from=feed&start=0&limit=24&channel=chunlei&clienttype=0&web=1’
- follows_json=json.loads(getHtml(url))
- if follows_json[‘errno’]!=0:
- print u‘failed to fetch hot users’
- return False
- returns=[]
- count=0
- for item in follows_json[‘hotuser_list’]:
- count=count+1
- hot_uname=item[‘hot_uname’].encode(‘utf-8’)
- hot_uk=item[‘hot_uk’]
- avatar_url=item[‘avatar_url’].encode(‘utf-8’)
- intro=item[‘intro’].encode(‘utf-8’)
- follow_count=item[‘follow_count’]
- fans_count=item[‘fans_count’]
- pubshare_count=item[‘pubshare_count’]
- album_count=item[‘album_count’]
- returns.append({‘hot_uname’:hot_uname,‘hot_uk’:hot_uk,‘avatar_url’:avatar_url,‘intro’:intro,‘follow_count’:follow_count,‘fans_count’:fans_count,‘pubshare_count’:pubshare_count,‘album_count’:album_count})
- if count==0:
- print “got no hot users”
- return False
- else:
- print “success to fetched hot users: %d”%count
- return returns
- def getFans(self,uk,start=0,limit=24):
-
-
-
- follows_url=‘http://pan.baidu.com/pcloud/friend/getfanslist?query_uk=%d&limit=%d&start=%d’%(uk,limit,start)
- follows_json=json.loads(getHtml(follows_url,uk))
- if follows_json[‘errno’]!=0:
- print u‘failed to fetch fens’
- return False
- total_count=follows_json[‘total_count’]
- returns=[]
- count=0
- for item in follows_json[‘fans_list’]:
- count=count+1
- fans_uname=item[‘fans_uname’].encode(‘utf-8’)
- fans_uk=item[‘fans_uk’]
- avatar_url=item[‘avatar_url’].encode(‘utf-8’)
- intro=item[‘intro’].encode(‘utf-8’)
- follow_count=item[‘follow_count’]
- fans_count=item[‘fans_count’]
- pubshare_count=item[‘pubshare_count’]
- album_count=item[‘album_count’]
- returns.append({‘fans_uname’:fans_uname,‘fans_uk’:fans_uk,‘avatar_url’:avatar_url,‘intro’:intro,‘follow_count’:follow_count,‘fans_count’:fans_count,‘pubshare_count’:pubshare_count,‘album_count’:album_count})
- return (total_count,count,returns)
- def getFollows(self,uk,start=0,limit=24):
- follows_url=‘http://pan.baidu.com/pcloud/friend/getfollowlist?query_uk=%d&limit=%d&start=%d&bdstoken=d82467db8b1f5741daf1d965d1509181&channel=chunlei&clienttype=0&web=1’%(uk,limit,start)
- ref=‘http://pan.baidu.com/pcloud/friendpage?type=follow&uk=%d&self=1’%uk
- follows_json=json.loads(getHtml(follows_url,ref))
- if follows_json[‘errno’]!=0:
- print ‘getFollows errno:%d’%follows_json[‘errno’]
- print ‘request_url:’+follows_url
- if follows_json[‘errno’]==-55:
- self.errno=ERR_REFUSE
- else:
- self.errno=ERR_EX
- return False
- total_count=follows_json[‘total_count’]
- returns=[]
- count=0
- if(total_count>0):
- for item in follows_json[‘follow_list’]:
- count=count+1
- returns.append({
- ‘follow_uname’:item[‘follow_uname’].encode(‘utf-8’),
- ‘follow_uk’:item[‘follow_uk’],
- ‘avatar_url’:item[‘avatar_url’].encode(‘utf-8’),
- ‘intro’:item[‘intro’].encode(‘utf-8’),
- ‘follow_count’:item[‘follow_count’],
- ‘fans_count’:item[‘fans_count’],
- ‘pubshare_count’:item[‘pubshare_count’],
- ‘album_count’:item[‘album_count’]
- })
- return (total_count,count,returns)
- def getShareLists(self,uk,start=0,limit=60):
- sharelists_url=‘http://pan.baidu.com/pcloud/feed/getsharelist?category=0&auth_type=1&request_location=share_home&start=%d&limit=%d&query_uk=%d&channel=chunlei&clienttype=0&web=1’%(start,limit,uk)
- ref=‘http://pan.baidu.com/share/home?uk=%d&view=share’%uk
- listhtm=getHtml(sharelists_url,ref)
- print(sharelists_url)
- sharelists_json=json.loads(listhtm)
- if(sharelists_json[‘errno’]!=0):
- print ‘getShareLists errno:%d’%sharelists_json[‘errno’]
- print ‘request_url:’+sharelists_url
- if sharelists_json[‘errno’]==-55:
- self.errno=ERR_REFUSE
- else:
- self.errno=ERR_EX
- return False
- total_count=sharelists_json[‘total_count’]
- returns=[]
- count=0
- if total_count>0:
- for item in sharelists_json[‘records’]:
- count=count+1
- feed_type=item[‘feed_type’]
- isdir=0
- size=0
- md5=”
- album_id=”
- shorturl=”
- if feed_type==‘share’:
- if item[‘filecount’]==1:
- filelist=item[‘filelist’]
- isdir=filelist[0][‘isdir’]
- size=filelist[0][‘size’]
- md5=filelist[0][‘md5’]
- else:
- isdir=1
- elif feed_type==‘album’:
- album_id=item[‘album_id’]
- isdir=2
- if item.has_key(‘shorturl’):
- shorturl=item[‘shorturl’]
- if item.has_key(‘username’):
- username=item[‘username’].encode(‘utf-8’)
- if feed_type==‘share’ or feed_type==‘album’:
- returns.append({
- ‘title’:item[‘title’].encode(‘utf-8’),
- ‘username’:username,
- ‘shorturl’:shorturl,
- ‘shareid’:item[‘source_id’],
- ‘feed_time’:item[‘feed_time’]//1000,
- ‘dCnt’:item[‘dCnt’],
- ‘isdir’:isdir,
- ‘size’:size,
- ‘md5’:md5,
- ‘uk’:uk,
- ‘feed_type’:feed_type
- })
- return (total_count,count,returns)
- def getAlbum(self,uk,start=0,limit=60):
- url=‘http://pan.baidu.com/pcloud/album/getlist?start=%d&limit=%d&query_uk=%d&channel=chunlei&clienttype=0&web=1&bdstoken=d82467db8b1f5741daf1d965d1509181’%(start,limit,uk)
- album_json=json.loads(getHtml(url,uk))
- total_count=album_json[‘count’]
- returns=[]
- count=0
- for item in album_json[‘album_list’]:
- count=count+1
- title=item[‘title’].encode(‘utf-8’)
- album_id=item[‘album_id’]
- create_time=item[‘create_time’]
- update_time=item[‘update_time’]
- filecount=item[‘filecount’]
- desc=item[‘desc’]
- returns.append({‘title’:title,‘album_id’:album_id,‘create_time’:create_time,‘desc’:desc,‘update_time’:update_time,‘filecount’:filecount,‘uk’:uk})
- if count==0:
- print “get nothing”
- return False
- else:
- print “success to fetched : %d”%count
- if (start+count)<total_count:
- start=start+limit
- returns=returns+self.getAlbum(uk,start)
- return returns
- def seedUsers(self):
- hot_usrs=self.getHotUser()
- if not hot_usrs:
- return
- try:
- for user in hot_usrs:
- time_stamp=int(time.time())
- if user[‘pubshare_count’]>0:
- self.db.execute(“INSERT INTO share_users (uk,user_name,avatar_url,intro,follow_count,album_count,\
- fens_count,pubshare_count,last_visited,create_time,weight) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)”,(
- user[‘hot_uk’],user[‘hot_uname’],user[‘avatar_url’],user[‘intro’],user[‘follow_count’],
- user[‘album_count’],user[‘fans_count’],user[‘pubshare_count’],time_stamp,time_stamp,5
- )
- )
- uid=self.db.last_row_id()
- self.db.execute(“INSERT INTO spider_list (uk,uid) VALUES(%s,%s)”,(user[‘hot_uk’],uid))
- except:
- traceback.print_exc()
- self.db.rollback()
- else:
- self.db.commit()
- def startSpider(self):
- if self.spider_queue.empty():
- fetched_users=self.db.execute(‘SELECT * from spider_list ORDER BY weight DESC limit 0,20’)
- if fetched_users<=0:
- print ‘nothing to spider,spider_list is empty’
- return False
- self.start=‘start’
- self.errno=ERR_NO
- fetchall=self.db.fetchall()
-
- for item in fetchall:
- self.spider_queue.put({
- ‘sid’:item[0],
- ‘uk’:item[1],
- ‘file_fetched’:item[2],
- ‘follow_fetched’:item[3],
- ‘follow_done’:item[4],
- ‘file_done’:item[5],
- ‘weight’:item[6],
- ‘uid’:item[7]
- })
- self.got_follow_count=0
- self.got_files_count=0
- self.while_count=0
- while not self.spider_queue.empty():
- self.while_count+=1
- share_user=self.spider_queue.get()
-
- if not share_user[‘file_done’]:
- print ‘%d now spidering file ,%d file fetched’%(share_user[‘uk’],share_user[‘file_fetched’])
- rs=self.getShareLists(share_user[‘uk’],share_user[‘file_fetched’])
-
- if not rs:
- print ‘uk:%d error to fetch files,try again later…’%share_user[‘uk’]
- return True
- total_count,fetched_count,file_list=rs
- total_fetched=share_user[‘file_fetched’]+fetched_count
- print ‘fetched_file_count:%d’%fetched_count
- if total_fetched>=total_count or total_count==0:
- share_user[‘file_done’]=1
- if total_count==0:
- self.db.execute(“UPDATE spider_list set file_done=%s WHERE sid=%s”,(1,share_user[‘sid’]))
- self.db.commit()
- else:
- try:
- files_count=0
- for file in file_list:
- files_count+=1
- ext=”
- file_type=”
- file_type_i=-1
- if file[‘isdir’]==0 and file[‘feed_type’]==‘share’:
- ext = metautils.get_extension(file[‘title’]).lower()
- file_type = metautils.get_category(ext)
- file_type_i=self.file_type_t[file_type]
- time_stamp=int(time.time())
- self.db.execute(“INSERT INTO share_file (title,uk,user_name,shareid,shorturl,isdir,size,md5,ext,feed_time,create_time,file_type,uid,feed_type) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)”,(file[‘title’],file[‘uk’],file[‘username’],file[‘shareid’], file[‘shorturl’],file[‘isdir’],file[‘size’],file[‘md5’],ext,file[‘feed_time’],time_stamp,file_type_i,share_user[‘uid’],file[‘feed_type’])
- )
- except:
- share_user[‘file_done’]=0
- self.db.rollback()
- traceback.print_exc()
- return False
- else:
- self.db.execute(“UPDATE spider_list set file_fetched=%s,file_done=%s WHERE sid=%s”,(total_fetched,share_user[‘file_done’],share_user[‘sid’]))
- self.db.execute(“UPDATE share_users set fetched=%s WHERE uid=%s”,(total_fetched,share_user[‘uid’]))
- share_user[‘file_fetched’]=total_fetched
- self.got_files_count+=files_count
- self.db.commit()
-
- if share_user[‘follow_done’]==0 and share_user[‘file_done’]==1:
- print ‘%d now spidering follow ,%d follow fetched’%(share_user[‘uk’],share_user[‘follow_fetched’])
- rs=self.getFollows(share_user[‘uk’],share_user[‘follow_fetched’])
- if not rs:
- print ‘error to fetch follows,try again later…’
- return
- total_count,fetched_count,follow_list=rs
- total_fetched=share_user[‘follow_fetched’]+fetched_count
- print ‘fetched_follow_count:%d’%fetched_count
- if total_fetched>=total_count or total_count==0:
- share_user[‘follow_done’]=1
- if total_count==0:
- self.db.execute(“DELETE FROM spider_list WHERE sid=%s”,(share_user[‘sid’],))
- self.db.commit()
- else:
- try:
- follow_count=0
- for follow in follow_list:
- follow_count+=1
-
- if self.db.execute(‘SELECT * FROM share_users WHERE uk=%s’,(follow[‘follow_uk’],))>0:
- print ‘uk:%d has already in share_user table’%follow[‘follow_uk’]
- continue
- time_stamp=int(time.time())
- self.db.execute(“INSERT INTO share_users (uk,user_name,avatar_url,intro,follow_count,album_count,\
- fens_count,pubshare_count,last_visited,create_time,weight) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)”,(
- follow[‘follow_uk’],follow[‘follow_uname’],follow[‘avatar_url’],follow[‘intro’],follow[‘follow_count’],
- follow[‘album_count’],follow[‘fans_count’],follow[‘pubshare_count’],time_stamp,time_stamp,5
- )
- )
-
- self.db.execute(“INSERT INTO spider_list (uk,uid) VALUES(%s,%s)”,(follow[‘follow_uk’],self.db.last_row_id()))
- except:
- share_user[‘follow_done’]=0
- self.db.rollback()
- traceback.print_exc()
- return False
- else:
- if share_user[‘follow_done’]==1:
-
- print ‘delete follow fetched sid:%d from spider_list’%share_user[‘sid’]
- self.db.execute(“DELETE FROM spider_list WHERE sid=%s”,(share_user[‘sid’],))
- else:
- self.db.execute(“UPDATE spider_list set follow_fetched=%s,follow_done=%s WHERE sid=%s”,(total_fetched,share_user[‘follow_done’],share_user[‘sid’]))
- share_user[‘follow_fetched’]=total_fetched
- self.got_follow_count+=follow_count
- self.db.commit()
-
- if share_user[‘follow_done’]==0:
- self.spider_queue.put(share_user)
- else:
- print ‘%d has done’%share_user[‘uk’]
- del share_user
- time.sleep(SPIDER_INTERVAL)
- print ‘—————–Done——————‘
- print ‘while_count:%d’%self.while_count
- print ‘got_follow_count:%d’%self.got_follow_count
- print ‘got_files_count:%d’%self.got_files_count
- return True
- def stop(self):
- pass
- if __name__ == “__main__”:
- parser = argparse.ArgumentParser()
- parser.add_argument(“–seed-user”, help=“get seed user”, action=“store_true”)
- args = parser.parse_args()
- spider=BaiduPanSpider()
-
- if args.seed_user:
- spider.seedUsers()
- else:
- while(1):
- print ‘start spider…’
- result=spider.startSpider()
- if not result:
- print ‘The spider is refused,5 mins later try again auto…’
- time.sleep(60*5)
- else:
- print ‘one worker queue id done’
- time.sleep(1)