Logo Search packages:      
Sourcecode: harvestman version File versions  Download package

def HarvestMan::rules::harvestManRulesChecker::__is_external_link (   self,
  urlObj 
) [private]

Check whether the URL is an external link relative to the starting URL,
using the download rules set by the user.

Definition at line 506 of file rules.py.

00506                                         :
        """ Check if the url is an external link relative to starting url,
        using the download rules set by the user.

        urlObj is the url object being tested.

        Returns True if the url is to be treated as an external link and
        False otherwise.

        Side effects visible in this method: it may set
        self._configobj.epagelinks or self._configobj.eserverlinks to 1,
        may add the full url to the filter through self.add_to_filter(),
        and may call the external directory/server count helpers. """

        # Example.
        # Assume our start url is 'http://www.server.com/files/images/index.html'
        # Then any url which starts with another server name or at a level
        # above the start url's base directory on the same server is considered
        # an external url
        # i.e, http://www.yahoo.com will be external because of
        # 1st reason &
        # http://www.server.com/files/search.cgi will be external link because of
        # 2nd reason.
        # External links ?

        # if under the same starting directory, return False
        if self.is_under_starting_directory(urlObj):
            return False

        # Directory of this url; used by the fetchlevel 3 branch below
        # when counting external directories.
        directory = urlObj.get_url_directory()

        # The base (starting) url object comes from the tracker queue.
        # Without it there is nothing to compare against, so the url is
        # reported as not external.
        tq = GetObject('trackerqueue')
        baseUrlObj = tq.get_base_urlobject()
        if not baseUrlObj:
            return False

        # Stylesheets and images are never external when the matching
        # config option is set, regardless of where they live.
        if urlObj.get_type() == 'stylesheet':
            if self._configobj.getstylesheets: return False

        elif urlObj.get_type() == 'image':
            if self._configobj.getimagelinks: return False

        if not self.is_external_server_link(urlObj):
            # Same server as the base url, but outside the starting
            # directory (the starting-directory case returned above).
            # print 'Same server ', urlObj.domain, baseUrlObj.domain
            if self._configobj.fetchlevel==0:
                # fetchlevel 0: everything outside the starting directory
                # is external.
                return True
            elif self._configobj.fetchlevel==3:
                # check for the directory of the parent url
                # if it is same as starting directory, allow this
                # url, else deny
                try:
                    parentUrlObj = urlObj.get_base_urlobject()
                    if not parentUrlObj:
                        return False

                    parentdir = parentUrlObj.get_url_directory()
                    bdir = baseUrlObj.get_url_directory()

                    if parentdir == bdir:
                        self.__increment_ext_directory_count(directory)
                        return False
                    else:
                        return True
                except urlparser.HarvestManUrlParserError, e:
                    # The error is only printed; control then falls
                    # through to the directory checks below.
                    print e
            elif self._configobj.fetchlevel > 0:
                # Reached for any positive fetchlevel other than 3
                # (0 and 3 are handled by the branches above).
                # this option takes precedence over the
                # extpagelinks option, so set extpagelinks
                # option to true.
                self._configobj.epagelinks=1
                # do other checks , just fall through

            # Increment external directory count
            # NOTE(review): the code below calls __ext_directory_check();
            # whether that helper also increments the count is not
            # visible from here - confirm against its definition.
            directory = urlObj.get_url_directory()

            res=self.__ext_directory_check(directory)
            if not res:
                # Directory check failed: filter the url and report it
                # as external.
                extrainfo("External directory error - filtered!")
                self.add_to_filter(urlObj.get_full_url())
                return True

            # Apply depth check for external dirs here
            # NOTE(review): mode=2 is also used in the external server
            # branch below - confirm the same mode is intended for both.
            if self._configobj.extdepth:
                if self.__apply_depth_check(urlObj, mode=2):
                    return True

            if self._configobj.epagelinks:
                # We can get external links belonging to same server,
                # so this is not an external link
                return False
            else:
                # We cannot get external links belonging to same server,
                # so this is an external link
                self.add_to_filter(urlObj.get_full_url())
                return True
        else:
            # print 'Different server ', urlObj.domain, baseUrlObj.domain
            # print 'Fetchlevel ', self._configobj.fetchlevel
            # Both belong to different base servers
            if self._configobj.fetchlevel==0 or self._configobj.fetchlevel == 1:
                # fetchlevel 0 and 1: links on another server are
                # always external.
                return True
            elif self._configobj.fetchlevel==2 or self._configobj.fetchlevel==3:
                # check whether the baseurl (parent url of this url)
                # belongs to the starting server. If so allow fetching
                # else deny. ( we assume the baseurl path is not relative! :-)
                try:
                    parentUrlObj = urlObj.get_base_urlobject()
                    baseserver = baseUrlObj.get_domain()

                    if not parentUrlObj:
                        return False

                    server = urlObj.get_domain()
                    if parentUrlObj.get_domain() == baseserver:
                        self.__increment_ext_server_count(server)
                        return False
                    else:
                        return True
                except urlparser.HarvestManUrlParserError, e:
                    # The error is only printed; control then falls
                    # through to the server checks below.
                    print e
            elif self._configobj.fetchlevel>3:
                # this option takes precedence over the
                # extserverlinks option, so set extserverlinks
                # option to true.
                self._configobj.eserverlinks=1
                # do other checks , just fall through

            # Check the domain of this url against the external server
            # rules; on failure filter the url and report it as external.
            res = self.__ext_server_check(urlObj.get_domain())

            if not res:
                self.add_to_filter(urlObj.get_full_url())
                return True

            # Apply filter for servers here
            if self.__apply_server_filter(urlObj):
                return True

            # Apply depth check for external servers here
            if self._configobj.extdepth:
                if self.__apply_depth_check(urlObj, mode=2):
                    return True

            if self._configobj.eserverlinks:
                # We can get links belonging to another server, so
                # this is NOT an external link
                return False
            else:
                # We cannot get external links belonging to another server,
                # so this is an external link
                self.add_to_filter(urlObj.get_full_url())
                return True

        # We should not reach here
        # (both branches of the if/else above end in a return).
        return False


Generated by  Doxygen 1.6.0   Back to index