Python搜索引擎安装教程

运行环境
开始之前你需要安装

•PHP 5.3.7 +
•MySQL
•Python 2.7 ~
•xunsearch 搜索引擎
获取源码
ssh 方式:

git clone git@github.com:k1995/BaiduyunSpider.git
https 方式:

git clone https://github.com/k1995/BaiduyunSpider
或手动下载

https://github.com/k1995/BaiduyunSpider/archive/master.zip
下载完毕后,项目的目录结构大致是这样的

— indexer/  #索引
— spider/   #爬虫
— sql/
— web/      #网站
    — application/
        — config/ # 配置相关
            — config.php
            — database.php # 数据库配置

    — static/ # 存放静态资源,css|js|font
    — system/
    — index.php

开始部署
创建数据库
创建名为 pan 的数据库,编码设为 utf-8。然后导入 sql/ 目录下的 SQL 文件,完成表的创建。

网站部署
支持nginx,apache 服务器。

apache 需要开启 mod_rewrite 。

nginx  配置如下

location /
{
index index.php;
try_files $uri $uri/ /index.php/$uri;
}

location ~ [^/]\.php(/|$)
{
fastcgi_pass  127.0.0.1:9000;
fastcgi_index index.php;
include fastcgi.conf;
include pathinfo.conf;
}
配置文件修改
config.php 文件修改网站标题,描述等信息

database.php 修改数据库账号,密码等信息

网站是基于CodeIgniter 框架开发的,如安装,部署,或二次开发有问题,请参考官网文档
启动爬虫
进入 spider/目录,修改spider.py 中数据库信息。

如果你是第一次部署,需运行下面命令,完成做种

python spider.py --seed-user
上面其实就是抓取百度云热门分享用户的相关信息,然后从他们开始入手爬取数据

然后运行

python spider.py
此时爬虫已经开始工作了

安装xunsearch
目前使用xunsearch作为搜索引擎,后面会更换为elasticsearch。

安装过程请参考下面的文档(不需要安装 PHP SDK,我已经把它整合到 web 里了)

http://xunsearch.com/doc/php/guide/start.installation

索引数据
上面我们完成了爬虫的数据抓取,网站的搭建,但还不能搜索,下面开始最后一步,索引的建立。

进入 indexer/ 目录,在 indexer.php 中将 $prefix 替换为你 web 的根路径

require '$prefix/application/helpers/xs/lib/XS.php';
并修改数据库账号密码

然后运行

./indexer.php

 

  1. # -*- coding: utf-8 -*-
  2. import urllib2,re,argparse,json,time
  3. import MySQLdb as mdb
  4. import metautils,traceback,Queue,socket
  5. import random
  6. “””
  7. /*
  8.  *——————————————–   
  9.  *
  10.  *  
  11.  *    
  12.  *  Github 仓库: https://github.com/k1995/BaiduyunSpider
  13.  * 
  14.  *  演示:http://www.11bt.net/ *
  15.  *   
  16.  * —————————————-*/
  17. “””
  18. DB_HOST=‘127.0.0.1’
  19. DB_PORT=‘3306’
  20. DB_USER=‘root’
  21. # MySQL密码
  22. DB_PASS=‘123123’
  23. # 数据库名称
  24. DB_NAME=‘pan’
  25. SPIDER_INTERVAL=1
  26. ERR_NO=0#正常
  27. ERR_REFUSE=1#爬虫爬取速度过快,被拒绝
  28. ERR_EX=2#未知错误
  29. proxy_list = [
  30.                {‘http’:“x.x.x.x:8080”},
  31.                {‘http’:“x.x.x.x:8081”},
  32.                {‘http’:“x.x.x.x:8082”},
  33.                {‘http’:“x.x.x.x:8083”},
  34.                {‘http’:“x.x.x.x:8084”},
  35.                {‘http’:“x.x.x.x:8085”},
  36.                {‘http’:“x.x.x.x:8086”},
  37.                {‘http’:“x.x.x.x:8087”},
  38.                {‘http’:“x.x.x.x:8088”},
  39.                {‘http’:“x.x.x.x:8089”}
  40.                 ]
  41. def getHtml(url,ref=None,reget=5):
  42.  try:
  43.   uas = [
  44.     “Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4”,
  45.     “Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)”,
  46.     “Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko”,
  47.     “Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0”,
  48.     “Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36”,
  49.     “Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko”,
  50.     ]
  51.   proxy_ip =random.choice(proxy_list)
  52.   ua=random.choice(uas)
  53.   print proxy_ip
  54.   print ua
  55.   proxy_support = urllib2.ProxyHandler(proxy_ip)
  56.   opener = urllib2.build_opener(proxy_support,urllib2.HTTPHandler)
  57.   urllib2.install_opener(opener)
  58.   request = urllib2.Request(url)
  59.   time.sleep(5)
  60.   request.add_header(‘User-Agent’, ua)
  61.   if ref:
  62.    request.add_header(‘Referer’,ref)
  63.   page = urllib2.urlopen(request,timeout=30)
  64.   html = page.read()
  65.  except:
  66.   if reget>=1:
  67.    #如果getHtml失败,则再次尝试5次
  68.    print ‘getHtml error,reget…%d’%(6-reget)
  69.    time.sleep(20)
  70.    return getHtml(url,ref,reget-1)
  71.   else:
  72.    print ‘request url:’+url
  73.    print ‘failed to fetch html’
  74.    exit()
  75.  else:
  76.   return html
  77. class Db(object):
  78.  def __init__(self):
  79.   self.dbconn=None
  80.   self.dbcurr=None
  81.  def check_conn(self):
  82.   try:
  83.    self.dbconn.ping()
  84.   except:
  85.    return False
  86.   else:
  87.    return True
  88.  def conn(self):
  89.   self.dbconn=mdb.connect(DB_HOST, DB_USER, DB_PASS,DB_NAME, charset=‘utf8’)
  90.   self.dbconn.autocommit(False)
  91.   self.dbcurr = self.dbconn.cursor()
  92.  def fetchone(self):
  93.   return self.dbcurr.fetchone()
  94.  def fetchall(self):
  95.   return self.dbcurr.fetchall()
  96.  def execute(self, sql, args=None,falg=False):
  97.   if not self.dbconn:
  98.    #第一次链接数据库
  99.    self.conn()
  100.   try:
  101.    if args:
  102.     rs=self.dbcurr.execute(sql,args)
  103.    else:
  104.     rs=self.dbcurr.execute(sql)
  105.    return rs
  106.   except Exception, e:
  107.    if self.check_conn():
  108.     print ‘execute error’
  109.     traceback.print_exc()
  110.    else:
  111.     print ‘reconnect mysql’
  112.     self.conn()
  113.           if args:
  114.               rs=self.dbcurr.execute(sql,args)
  115.           else:
  116.               rs=self.dbcurr.execute(sql)
  117.           return rs
  118.  def commit(self):
  119.   self.dbconn.commit()
  120.  def rollback(self):
  121.   self.dbconn.rollback()
  122.  def close(self):
  123.   self.dbconn.close()
  124.   self.dbcurr.close()
  125.  def last_row_id(self):
  126.   return self.dbcurr.lastrowid
  127. class BaiduPanSpider(object):
  128.  def __init__(self):
  129.   self.db=Db()
  130.   self.files=[]
  131.   self.got_files_count=0
  132.   self.got_follow_count=0
  133.   self.while_count=0
  134.   self.spider_queue=Queue.Queue(maxsize=20)
  135.   self.status=‘stop’
  136.   self.errno=ERR_NO
  137.   self.file_type_t={‘video’:0,‘image’:1,‘document’:2,‘music’:3,‘package’:4,‘software’:5,‘torrent’:6,‘other’:-1}
  138.  def getShareUser(self,uk):
  139.   url=‘http://pan.baidu.com/share/count?uk=%d&channel=chunlei&clienttype=0&web=1’%uk
  140.   follows_json=json.loads(getHtml(url,uk))
  141.   if follows_json[‘errno’]!=0:
  142.    if follows_json[‘errno’]==-55:
  143.     self.errno=ERR_REFUSE
  144.    else:
  145.     self.errno=ERR_EX
  146.    return False
  147.   return {
  148.    ‘pubshare_cnt’:follows_json[‘pubshare_cnt’],
  149.    ‘fans’:follows_json[‘fans’],
  150.    ‘follow’:follows_json[‘follow’],
  151.    ‘album’:follows_json[‘follows_json’]
  152.   }
  153.  def getHotUser(self):
  154.   url=‘http://pan.baidu.com/pcloud/friend/gethotuserlist?type=1&from=feed&start=0&limit=24&channel=chunlei&clienttype=0&web=1’
  155.   follows_json=json.loads(getHtml(url))
  156.   if follows_json[‘errno’]!=0:
  157.    print u‘failed to fetch hot users’
  158.    return False
  159.   returns=[]
  160.   count=0
  161.   for item in follows_json[‘hotuser_list’]:
  162.    count=count+1
  163.    hot_uname=item[‘hot_uname’].encode(‘utf-8’)
  164.    hot_uk=item[‘hot_uk’]
  165.    avatar_url=item[‘avatar_url’].encode(‘utf-8’)
  166.    intro=item[‘intro’].encode(‘utf-8’)
  167.    follow_count=item[‘follow_count’]
  168.    fans_count=item[‘fans_count’]
  169.    pubshare_count=item[‘pubshare_count’]
  170.    album_count=item[‘album_count’]
  171.    returns.append({‘hot_uname’:hot_uname,‘hot_uk’:hot_uk,‘avatar_url’:avatar_url,‘intro’:intro,‘follow_count’:follow_count,‘fans_count’:fans_count,‘pubshare_count’:pubshare_count,‘album_count’:album_count})
  172.   if count==0:
  173.    print “got no hot users”
  174.    return False
  175.   else:
  176.    print “success to fetched hot users: %d”%count
  177.   return returns
  178.  def getFans(self,uk,start=0,limit=24):
  179.   #query_uk:用户ID
  180.   #limit:每一页最多显示数量
  181.   #start:当前页数
  182.   follows_url=‘http://pan.baidu.com/pcloud/friend/getfanslist?query_uk=%d&limit=%d&start=%d’%(uk,limit,start)
  183.   follows_json=json.loads(getHtml(follows_url,uk))
  184.   if follows_json[‘errno’]!=0:
  185.    print u‘failed to fetch fens’
  186.    return False
  187.   total_count=follows_json[‘total_count’]
  188.   returns=[]
  189.   count=0
  190.   for item in follows_json[‘fans_list’]:
  191.    count=count+1
  192.    fans_uname=item[‘fans_uname’].encode(‘utf-8’)
  193.    fans_uk=item[‘fans_uk’]
  194.    avatar_url=item[‘avatar_url’].encode(‘utf-8’)
  195.    intro=item[‘intro’].encode(‘utf-8’)
  196.    follow_count=item[‘follow_count’]
  197.    fans_count=item[‘fans_count’]
  198.    pubshare_count=item[‘pubshare_count’]
  199.    album_count=item[‘album_count’]
  200.    returns.append({‘fans_uname’:fans_uname,‘fans_uk’:fans_uk,‘avatar_url’:avatar_url,‘intro’:intro,‘follow_count’:follow_count,‘fans_count’:fans_count,‘pubshare_count’:pubshare_count,‘album_count’:album_count})
  201.   return (total_count,count,returns)
  202.  def getFollows(self,uk,start=0,limit=24):
  203.   follows_url=‘http://pan.baidu.com/pcloud/friend/getfollowlist?query_uk=%d&limit=%d&start=%d&bdstoken=d82467db8b1f5741daf1d965d1509181&channel=chunlei&clienttype=0&web=1’%(uk,limit,start)
  204.   ref=‘http://pan.baidu.com/pcloud/friendpage?type=follow&uk=%d&self=1’%uk
  205.   follows_json=json.loads(getHtml(follows_url,ref))
  206.   if follows_json[‘errno’]!=0:
  207.    print ‘getFollows errno:%d’%follows_json[‘errno’]
  208.    print ‘request_url:’+follows_url
  209.    if follows_json[‘errno’]==-55:
  210.     self.errno=ERR_REFUSE
  211.    else:
  212.     self.errno=ERR_EX
  213.    return False
  214.   total_count=follows_json[‘total_count’]
  215.   returns=[]
  216.   count=0
  217.   if(total_count>0):
  218.    for item in follows_json[‘follow_list’]:
  219.     count=count+1
  220.     returns.append({
  221.      ‘follow_uname’:item[‘follow_uname’].encode(‘utf-8’),
  222.      ‘follow_uk’:item[‘follow_uk’],
  223.      ‘avatar_url’:item[‘avatar_url’].encode(‘utf-8’),
  224.      ‘intro’:item[‘intro’].encode(‘utf-8’),
  225.      ‘follow_count’:item[‘follow_count’],
  226.      ‘fans_count’:item[‘fans_count’],
  227.      ‘pubshare_count’:item[‘pubshare_count’],
  228.      ‘album_count’:item[‘album_count’]
  229.     })
  230.   return (total_count,count,returns)
  231.  def getShareLists(self,uk,start=0,limit=60):
  232.   sharelists_url=‘http://pan.baidu.com/pcloud/feed/getsharelist?category=0&auth_type=1&request_location=share_home&start=%d&limit=%d&query_uk=%d&channel=chunlei&clienttype=0&web=1’%(start,limit,uk)
  233.   ref=‘http://pan.baidu.com/share/home?uk=%d&view=share’%uk
  234.   listhtm=getHtml(sharelists_url,ref)
  235.   print(sharelists_url)
  236.   sharelists_json=json.loads(listhtm)
  237.   if(sharelists_json[‘errno’]!=0):
  238.    print ‘getShareLists errno:%d’%sharelists_json[‘errno’]
  239.    print ‘request_url:’+sharelists_url
  240.    if sharelists_json[‘errno’]==-55:
  241.     self.errno=ERR_REFUSE
  242.    else:
  243.     self.errno=ERR_EX
  244.    return False
  245.   total_count=sharelists_json[‘total_count’]
  246.   returns=[]
  247.   count=0
  248.   if total_count>0:
  249.    for item in sharelists_json[‘records’]:
  250.     count=count+1
  251.     feed_type=item[‘feed_type’]
  252.     isdir=0
  253.     size=0
  254.     md5=
  255.     album_id=
  256.     shorturl=
  257.     if feed_type==‘share’:
  258.      if item[‘filecount’]==1:
  259.       filelist=item[‘filelist’]
  260.       isdir=filelist[0][‘isdir’]
  261.       size=filelist[0][‘size’]
  262.       md5=filelist[0][‘md5’]
  263.      else:
  264.       isdir=1
  265.     elif feed_type==‘album’:
  266.      album_id=item[‘album_id’]
  267.      isdir=2
  268.     if item.has_key(‘shorturl’):
  269.      shorturl=item[‘shorturl’]
  270.     if item.has_key(‘username’):
  271.      username=item[‘username’].encode(‘utf-8’)
  272.     if feed_type==‘share’ or feed_type==‘album’:
  273.      returns.append({
  274.       ‘title’:item[‘title’].encode(‘utf-8’),
  275.       ‘username’:username,
  276.       ‘shorturl’:shorturl,
  277.       ‘shareid’:item[‘source_id’],
  278.       ‘feed_time’:item[‘feed_time’]//1000,#分享时间
  279.       ‘dCnt’:item[‘dCnt’],
  280.       ‘isdir’:isdir,
  281.       ‘size’:size,
  282.       ‘md5’:md5,
  283.       ‘uk’:uk,
  284.       ‘feed_type’:feed_type
  285.      })
  286.   return (total_count,count,returns)
  287.  def getAlbum(self,uk,start=0,limit=60):
  288.   url=‘http://pan.baidu.com/pcloud/album/getlist?start=%d&limit=%d&query_uk=%d&channel=chunlei&clienttype=0&web=1&bdstoken=d82467db8b1f5741daf1d965d1509181’%(start,limit,uk)
  289.   album_json=json.loads(getHtml(url,uk))
  290.   total_count=album_json[‘count’]
  291.   returns=[]
  292.   count=0
  293.   for item in album_json[‘album_list’]:
  294.    count=count+1
  295.    title=item[‘title’].encode(‘utf-8’)
  296.    album_id=item[‘album_id’]
  297.    create_time=item[‘create_time’]
  298.    update_time=item[‘update_time’]
  299.    filecount=item[‘filecount’]
  300.    desc=item[‘desc’]
  301.    returns.append({‘title’:title,‘album_id’:album_id,‘create_time’:create_time,‘desc’:desc,‘update_time’:update_time,‘filecount’:filecount,‘uk’:uk})
  302.   if count==0:
  303.    print “get nothing”
  304.    return False
  305.   else:
  306.    print “success to fetched : %d”%count
  307.   if (start+count)<total_count:
  308.    start=start+limit
  309.    returns=returns+self.getAlbum(uk,start)
  310.   return returns
  311.  def seedUsers(self):
  312.   hot_usrs=self.getHotUser()
  313.   if not hot_usrs:
  314.    return
  315.   try:
  316.    for user in hot_usrs:
  317.     time_stamp=int(time.time())
  318.     if user[‘pubshare_count’]>0:
  319.      self.db.execute(“INSERT INTO share_users (uk,user_name,avatar_url,intro,follow_count,album_count,\
  320.       fens_count,pubshare_count,last_visited,create_time,weight) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)”,(
  321.        user[‘hot_uk’],user[‘hot_uname’],user[‘avatar_url’],user[‘intro’],user[‘follow_count’],
  322.        user[‘album_count’],user[‘fans_count’],user[‘pubshare_count’],time_stamp,time_stamp,5
  323.       )
  324.      )
  325.      uid=self.db.last_row_id()
  326.      self.db.execute(“INSERT INTO spider_list (uk,uid) VALUES(%s,%s)”,(user[‘hot_uk’],uid))
  327.   except:
  328.    traceback.print_exc()
  329.    self.db.rollback()
  330.   else:
  331.    self.db.commit()
  332.  def startSpider(self):
  333.   if self.spider_queue.empty():
  334.    fetched_users=self.db.execute(‘SELECT * from spider_list ORDER BY weight DESC limit 0,20’)
  335.    if fetched_users<=0:
  336.     print ‘nothing to spider,spider_list is empty’
  337.     return False
  338.    self.start=‘start’
  339.    self.errno=ERR_NO
  340.    fetchall=self.db.fetchall()
  341.    #将数据库中取出的待爬取的分享者,加入爬取队列
  342.    for item in fetchall:
  343.     self.spider_queue.put({
  344.     ‘sid’:item[0],
  345.     ‘uk’:item[1],
  346.     ‘file_fetched’:item[2],
  347.     ‘follow_fetched’:item[3],
  348.     ‘follow_done’:item[4],
  349.     ‘file_done’:item[5],
  350.     ‘weight’:item[6],
  351.     ‘uid’:item[7]
  352.    })
  353.    self.got_follow_count=0
  354.    self.got_files_count=0
  355.    self.while_count=0
  356.   while not self.spider_queue.empty():
  357.    self.while_count+=1
  358.    share_user=self.spider_queue.get()
  359.    #爬取分享者的文件列表
  360.    if not share_user[‘file_done’]:
  361.     print ‘%d now spidering file ,%d  file fetched’%(share_user[‘uk’],share_user[‘file_fetched’])
  362.     rs=self.getShareLists(share_user[‘uk’],share_user[‘file_fetched’])
  363.     #print(rs)
  364.     if not rs:
  365.      print ‘uk:%d error to fetch files,try again later…’%share_user[‘uk’]
  366.      return True
  367.     total_count,fetched_count,file_list=rs
  368.     total_fetched=share_user[‘file_fetched’]+fetched_count
  369.     print ‘fetched_file_count:%d’%fetched_count
  370.     if total_fetched>=total_count or total_count==0:
  371.      share_user[‘file_done’]=1#该分享者所有文件爬取完成
  372.     if total_count==0:
  373.      self.db.execute(“UPDATE spider_list set file_done=%s WHERE sid=%s”,(1,share_user[‘sid’]))
  374.      self.db.commit()
  375.     else:
  376.      try:
  377.       files_count=0
  378.       for file in file_list:
  379.        files_count+=1
  380.        ext=
  381.        file_type=
  382.        file_type_i=-1
  383.        if file[‘isdir’]==0 and file[‘feed_type’]==‘share’:
  384.         ext = metautils.get_extension(file[‘title’]).lower()
  385.         file_type = metautils.get_category(ext)
  386.         file_type_i=self.file_type_t[file_type]
  387.        time_stamp=int(time.time())
  388.        self.db.execute(“INSERT INTO share_file (title,uk,user_name,shareid,shorturl,isdir,size,md5,ext,feed_time,create_time,file_type,uid,feed_type) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)”,(file[‘title’],file[‘uk’],file[‘username’],file[‘shareid’],         file[‘shorturl’],file[‘isdir’],file[‘size’],file[‘md5’],ext,file[‘feed_time’],time_stamp,file_type_i,share_user[‘uid’],file[‘feed_type’])
  389.        )
  390.      except:
  391.       share_user[‘file_done’]=0
  392.       self.db.rollback()
  393.       traceback.print_exc()
  394.       return False
  395.      else:
  396.       self.db.execute(“UPDATE spider_list set file_fetched=%s,file_done=%s WHERE sid=%s”,(total_fetched,share_user[‘file_done’],share_user[‘sid’]))
  397.       self.db.execute(“UPDATE share_users set fetched=%s WHERE uid=%s”,(total_fetched,share_user[‘uid’]))
  398.       share_user[‘file_fetched’]=total_fetched
  399.       self.got_files_count+=files_count
  400.       self.db.commit()
  401.    #爬取完文件后在爬取订阅列表
  402.    if share_user[‘follow_done’]==0 and share_user[‘file_done’]==1:
  403.     print ‘%d now spidering follow ,%d  follow fetched’%(share_user[‘uk’],share_user[‘follow_fetched’])
  404.     rs=self.getFollows(share_user[‘uk’],share_user[‘follow_fetched’])
  405.     if not rs:
  406.      print ‘error to fetch follows,try again later…’
  407.      return
  408.     total_count,fetched_count,follow_list=rs
  409.     total_fetched=share_user[‘follow_fetched’]+fetched_count
  410.     print ‘fetched_follow_count:%d’%fetched_count
  411.     if total_fetched>=total_count or total_count==0:
  412.      share_user[‘follow_done’]=1
  413.     if total_count==0:
  414.      self.db.execute(“DELETE FROM spider_list WHERE sid=%s”,(share_user[‘sid’],))
  415.      self.db.commit()
  416.     else:
  417.      try:
  418.       follow_count=0
  419.       for follow in follow_list:
  420.        follow_count+=1
  421.        #判断该用户是否已经在表中了
  422.        if self.db.execute(‘SELECT * FROM share_users WHERE uk=%s’,(follow[‘follow_uk’],))>0:
  423.         print ‘uk:%d has already in share_user table’%follow[‘follow_uk’]
  424.         continue
  425.        time_stamp=int(time.time())
  426.        self.db.execute(“INSERT INTO share_users (uk,user_name,avatar_url,intro,follow_count,album_count,\
  427.         fens_count,pubshare_count,last_visited,create_time,weight) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)”,(
  428.          follow[‘follow_uk’],follow[‘follow_uname’],follow[‘avatar_url’],follow[‘intro’],follow[‘follow_count’],
  429.          follow[‘album_count’],follow[‘fans_count’],follow[‘pubshare_count’],time_stamp,time_stamp,5
  430.         )
  431.        )
  432.        #将获取的新分享者加入爬取列表
  433.        self.db.execute(“INSERT INTO spider_list (uk,uid) VALUES(%s,%s)”,(follow[‘follow_uk’],self.db.last_row_id()))
  434.      except:
  435.       share_user[‘follow_done’]=0
  436.       self.db.rollback()
  437.       traceback.print_exc()
  438.       return False
  439.      else:
  440.       if share_user[‘follow_done’]==1:
  441.        #订阅者爬取完成,该分享者的任务完成,从待爬取列表中删除
  442.        print ‘delete follow fetched sid:%d from spider_list’%share_user[‘sid’]
  443.        self.db.execute(“DELETE FROM spider_list WHERE sid=%s”,(share_user[‘sid’],))
  444.       else:
  445.        self.db.execute(“UPDATE spider_list set follow_fetched=%s,follow_done=%s WHERE sid=%s”,(total_fetched,share_user[‘follow_done’],share_user[‘sid’]))
  446.       share_user[‘follow_fetched’]=total_fetched
  447.       self.got_follow_count+=follow_count
  448.       self.db.commit()
  449.    #只要分享者列表没完成,说明该分享者还未爬取完,则加入工作队列,继续爬取
  450.    if share_user[‘follow_done’]==0:
  451.     self.spider_queue.put(share_user)
  452.    else:
  453.     print ‘%d has done’%share_user[‘uk’]
  454.     del share_user
  455.    time.sleep(SPIDER_INTERVAL)
  456.   print ‘—————–Done——————‘
  457.   print ‘while_count:%d’%self.while_count
  458.   print ‘got_follow_count:%d’%self.got_follow_count
  459.   print ‘got_files_count:%d’%self.got_files_count
  460.   return True
  461.  def stop(self):
  462.   pass
  463. if __name__ == “__main__”:
  464.  parser = argparse.ArgumentParser()
  465.  parser.add_argument(“–seed-user”, help=“get seed user”, action=“store_true”)
  466.  args = parser.parse_args()
  467.  spider=BaiduPanSpider()
  468.  # 做种
  469.  if args.seed_user:
  470.   spider.seedUsers()
  471.  else:
  472.   while(1):
  473.    print ‘start spider…’
  474.    result=spider.startSpider()
  475.    if not result:
  476.     print ‘The spider is refused,5 mins later try again auto…’
  477.     time.sleep(60*5)
  478.    else:
  479.     print ‘one worker queue id done’
  480.     time.sleep(1)


发表评论

电子邮件地址不会被公开。 必填项已用*标注