
def HarvestMan::rules::harvestManRulesChecker::__apply_rep ( self, urlObj )  [private]

Check whether the robots.txt file on the server allows fetching of this URL.
Returns False if fetching is allowed and True if fetching is blocked.
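
The method builds on Python's standard robotparser module (urllib.robotparser in Python 3). As a point of reference, a minimal standalone sketch of that API is shown below; the host and user agent strings are placeholders, not HarvestMan values:

    # Minimal sketch of the robotparser API the method below relies on.
    # The URL and user agent are placeholders for illustration only.
    try:
        from urllib import robotparser      # Python 3
    except ImportError:
        import robotparser                  # Python 2, as used in the listing

    rp = robotparser.RobotFileParser()
    rp.set_url("http://www.example.com/robots.txt")
    rp.read()                               # fetch and parse robots.txt

    if rp.can_fetch("HarvestMan", "http://www.example.com/docs/"):
        print("fetching allowed")
    else:
        print("fetching blocked")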

Definition at line 186 of file rules.py.

    def __apply_rep(self, urlObj):
        """ See if the robots.txt file on the server
        allows fetching of this url. Return 0 on success
        (fetching allowed) and 1 on failure(fetching blocked) """

        # NOTE: Rewrote this method completely
        # on Nov 18 for 1.4 b2.

        # robots option turned off
        if self._configobj.robots==0: return False

        domport = urlObj.get_full_domain_with_port()

        url_directory = urlObj.get_url_directory()
        # Optimization #1: Check if this directory
        # is already there in the white list
        try:
            self._robocache.index(url_directory)
            return False
        except ValueError:
            pass

        # The robots.txt file url
        robotsfile = "".join((domport, '/robots.txt'))

        # if this url exists in filter list, return
        # True rightaway
        try:
            self._filter.index(urlObj.get_full_url())
            return True
        except ValueError:
            pass

        # Optimization #2: Maintain a cache
        # of robot parser objects to each
        # server.
        try:
            rp = self._robots[domport]
            # If there is an entry, but it
            # is None, it means there is no
            # robots.txt file in the server
            # (see below). So return False.
            if not rp:
                return False
        except KeyError:
            # Not there, create a fresh
            # one and add it.
            rp = robotparser.RobotFileParser()
            rp.set_url(robotsfile)
            ret = rp.read()
            if ret==-1:
                # no robots.txt file
                # Set the entry for this
                # server as None, so next
                # time we dont need to do
                # this operation again.
                self._robots[domport] = None
                return False
            else:
                # Set it
                self._robots[domport] = rp

        # Get user-agent from Spider
        ua = GetObject('USER_AGENT')

        if rp.can_fetch(ua, url_directory):
            # Add to white list
            self._robocache.append(url_directory)
            return False

        # Cannot fetch, so add to filter
        # for quick look up later.
        self.add_to_filter(urlObj.get_full_url())

        return True
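
For comparison, the same technique (one cached RobotFileParser per server, plus a whitelist of directories already known to be allowed) can be sketched outside HarvestMan roughly as follows. The class and attribute names are illustrative and not part of the HarvestMan API, and the error handling is simplified compared to the listing above:

    # Illustrative sketch of per-server robots.txt caching; names are
    # made up and error handling is simplified relative to HarvestMan.
    try:
        from urllib import robotparser   # Python 3
    except ImportError:
        import robotparser               # Python 2

    class RobotsCache(object):
        def __init__(self, user_agent):
            self.user_agent = user_agent
            self.parsers = {}            # domport -> RobotFileParser (or None)
            self.allowed_dirs = set()    # directories already cleared

        def is_blocked(self, domport, url_directory):
            """Return True if robots.txt on domport blocks url_directory."""
            if url_directory in self.allowed_dirs:
                return False

            if domport not in self.parsers:
                rp = robotparser.RobotFileParser()
                rp.set_url(domport + '/robots.txt')
                try:
                    rp.read()
                    self.parsers[domport] = rp
                except IOError:
                    # Treat a missing/unreadable robots.txt as "allow all",
                    # mirroring the None entry in the HarvestMan cache.
                    self.parsers[domport] = None

            rp = self.parsers[domport]
            if rp is None or rp.can_fetch(self.user_agent, url_directory):
                self.allowed_dirs.add(url_directory)
                return False
            return True

    # Example (placeholder values):
    # cache = RobotsCache('HarvestMan')
    # cache.is_blocked('http://www.example.com', 'http://www.example.com/docs/')

Caching one parser per server means robots.txt is fetched at most once per host, and the directory whitelist avoids repeating the can_fetch call for URLs in directories that have already been cleared.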

