Logo Search packages:      
Sourcecode: harvestman version File versions  Download package

def HarvestMan.urlqueue.HarvestManCrawlerQueue.crawl(self)

Starts crawling for this project 

Definition at line 133 of file urlqueue.py.

00133                    :
        """ Starts crawling for this project """

        # Reset flag
        self._flag = 0
        
        if os.name=='nt':
            t1=time.clock()
        else:
            t1=time.time()

        # Set start time on config object
        self._configobj.starttime = t1

        if not self._configobj.urlserver:
            self.push(self._baseUrlObj, 'crawler')
        else:
            try:
                # Flush url server of any previous urls by
                # sending a flush command.
                send_url("flush", self._configobj.urlhost, self._configobj.urlport)
                send_url(str(self._baseUrlObj.index),
                         self._configobj.urlhost,
                         self._configobj.urlport)
            except:
                pass

        # Start harvestman controller thread
        # (New in 1.4)
        import datamgr
        
        self._controller = datamgr.harvestManController()
        self._controller.start()
            
        if self._configobj.fastmode:
            # Create the number of threads in the config file
            # Pre-launch the number of threads specified
            # in the config file.

            # Initialize thread dictionary
            self._basetracker.setDaemon(True)
            self._basetracker.start()

            # For simple downloads using nocrawl option
            # there is no need to start more than one
            # thread. The following one line is the core
            # of the nocrawl mode, apart from a few
            # changes in datamgr and config.
            if not self._configobj.nocrawl:
                while self._basetracker.get_status() != 0:
                    time.sleep(0.1)

                for x in range(1, self._configobj.maxtrackers):
                    
                    # Back to equality among threads
                    if x % 2==0:
                        t = crawler.HarvestManUrlFetcher(x, None)
                    else:
                        t = crawler.HarvestManUrlCrawler(x, None)
                    
                    self.add_tracker(t)
                    t.setDaemon(True)
                    t.start()

                for t in self._trackers:
                    
                    if t.get_role() == 'fetcher':
                        self._numfetchers += 1
                    elif t.get_role() == 'crawler':
                        self._numcrawlers += 1

                # bug: give the threads some time to start,
                # otherwise we exit immediately sometimes.
                time.sleep(2.0)

            self.mainloop()
            
            # Set flag to 1 to denote that downloading is finished.
            self._flag = 1
            
            self.stop_threads(noexit = True)
        else:
            self._basetracker.action()

    def get_base_tracker(self):


Generated by  Doxygen 1.6.0   Back to index