
def HarvestMan::crawler::HarvestManUrlFetcher::process_url(self)

This function downloads the data for a URL and writes its files.
It also posts the data for web pages to a data queue.
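
To make the shape of that work concrete before the listing, here is a minimal standalone sketch of the same fetch-parse-push flow in modern Python. Everything in it (LinkExtractor, process_page, the plain queue.Queue) is a hypothetical stand-in for HarvestMan's web-page parser, datamanager and crawler queue; the real method, shown below, is Python 2 and threads rules checking, link localisation and non-blocking queue pushes through the same flow.

    from html.parser import HTMLParser
    from urllib.parse import urljoin
    from queue import Queue

    class LinkExtractor(HTMLParser):
        """Collects link targets from a page, honouring <base href="...">."""
        def __init__(self):
            HTMLParser.__init__(self)
            self.base_url = None
            self.links = []

        def handle_starttag(self, tag, attrs):
            attrs = dict(attrs)
            if tag == 'base' and 'href' in attrs:
                self.base_url = attrs['href']
            elif 'href' in attrs:
                self.links.append(attrs['href'])
            elif 'src' in attrs:
                self.links.append(attrs['src'])

    def process_page(url, data, crawler_queue):
        """Parse downloaded HTML and push child links, as process_url does."""
        parser = LinkExtractor()
        parser.feed(data)
        parser.close()
        # Relative links resolve against <base href> when one is declared,
        # otherwise against the page's own URL.
        base = parser.base_url or url
        children = [urljoin(base, link) for link in parser.links if link]
        crawler_queue.put((url, children))

    q = Queue()
    process_page('http://example.com/', '<a href="page2.html">next</a>', q)
    print(q.get())  # ('http://example.com/', ['http://example.com/page2.html'])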

Reimplemented from HarvestMan::crawler::HarvestManBaseUrlCrawler.

Reimplemented in HarvestMan::crawler::HarvestManUrlDownloader.

Definition at line 532 of file crawler.py.

    def process_url(self):
        """ This function downloads the data for a url and writes its files.
        It also posts the data for web pages to a data queue """

        mgr = GetObject('datamanager')            
        moreinfo('Downloading file for url', self._url)
        data = mgr.download_url(self._urlobject)

        # Add webpage links in datamgr, if we managed to
        # download the url

        if self._urlobject.is_webpage() and data:
            # MOD: Need to localise <base href="..." links if any
            # so add a NULL entry. (Nov 30 2004 - Refer header)
            mgr.update_links(self._urlobject.get_full_filename(), [])
            url_obj = self._urlobject
            
            self._status = 2
            
            extrainfo("Parsing web page", self._url)

            try:
                self.wp.reset()
                self.wp.feed(data)
                # Bug Fix: If the <base href="..."> tag was defined in the
                # web page, relative urls must be constructed against
                # the url provided in <base href="...">
                
                if self.wp.base_url_defined():
                    url = self.wp.get_base_url()
                    if not self._urlobject.is_equal(url):
                        extrainfo("Base url defined, replacing",self._url)
                        # Construct a url object
                        url_obj = urlparser.HarvestManUrlParser(url,
                                                                'base',
                                                                0,
                                                                self._urlobject,
                                                                self._configobj.projdir)
                        url_obj.set_index()
                        SetUrlObject(url_obj)
                        # Save a reference otherwise
                        # proxy might be deleted
                        self._tempobj = url_obj

                self.wp.close()
            except (SGMLParseError, IOError), e:
                debug(str(e))

            links = self.wp.links
            # Add image links as well, if configured
            if self._configobj.images:
                links += self.wp.images
            
            # Fix for hanging threads - Append objects
            # to local buffer if queue was full.

            # Mod in 1.4.1 - Make url objects right here
            # and push them... this fixes bugs in localising
            # links, since we call update_links here and
            # not later.

            urlobjlist = []

            # Rules checker object
            ruleschecker = GetObject('ruleschecker')
            
            for typ, url in links:
                # Skip empty links before inspecting them
                if not url: continue

                is_cgi, is_php = False, False

                if url.find('php?') != -1: is_php = True
                if typ == 'form' or is_php: is_cgi = True

                try:
                    child_urlobj = urlparser.HarvestManUrlParser(url,
                                                                 typ,
                                                                 is_cgi,
                                                                 url_obj)

                    urlobjlist.append(child_urlobj)
                except urlparser.HarvestManUrlParserError:
                    continue
            
            if not self._crawlerqueue.push((url_obj, urlobjlist), 'fetcher'):
                if not self._configobj.blocking:
                    self.buffer.append((url_obj, urlobjlist))
                    
            # Update links for this page here, so they can be localised
            mgr.update_links(url_obj.get_full_filename(), urlobjlist)
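
The bug fix noted in the listing, constructing relative URLs against the URL from <base href="..."> when that tag is present, is easy to demonstrate with the standard library. A short sketch with made-up URLs (not HarvestMan code):

    from urllib.parse import urljoin

    page_url = 'http://example.com/docs/index.html'
    base_url = 'http://example.com/static/'     # value of <base href="...">
    link = 'images/logo.png'                    # relative link in the page

    # Resolving against the page URL gives the wrong location...
    print(urljoin(page_url, link))   # http://example.com/docs/images/logo.png
    # ...while resolving against the declared base gives the right one.
    print(urljoin(base_url, link))   # http://example.com/static/images/logo.png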
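
The push near the end shows the fix for hanging threads: when the crawler queue cannot accept the item, the fetcher stashes it in a local buffer instead of blocking. Below is a minimal standalone sketch of that pattern, assuming a hypothetical BufferedPusher class; the real code keys the decision off self._configobj.blocking and drains self.buffer elsewhere.

    from queue import Queue, Full

    class BufferedPusher:
        """Non-blocking push with a local overflow buffer."""
        def __init__(self, q):
            self.q = q
            self.buffer = []              # items the queue refused

        def push(self, item):
            try:
                self.q.put_nowait(item)   # never block the worker thread
            except Full:
                self.buffer.append(item)  # retried on a later flush

        def flush(self):
            # Drain as much of the buffer as the queue will accept.
            while self.buffer:
                try:
                    self.q.put_nowait(self.buffer[0])
                except Full:
                    break
                self.buffer.pop(0)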

