Logo Search packages:      
Sourcecode: harvestman version File versions  Download package

def HarvestMan::rules::harvestManRulesChecker::violates_basic_rules (   self,
  urlObj 
)

Check the basic rules for this URL object.
This function returns True if the URL object
violates the rules; otherwise it returns False.

Definition at line 52 of file rules.py.

00052                                           :
        """ Check the basic rules for this url object,
        This function returns True if the url object
        violates the rules, else returns False """

        url = urlObj.get_full_url()

        # if this url exists in filter list, return
        # True rightaway
        try:
            self._filter.index(url)
            return True
        except ValueError:
            pass

       # now apply the url filter
        if self.__apply_url_filter(url):
            extrainfo("Custom filter - filtered ", url)
            return True

        # now apply the junk filter
        if self.junkfilter:
            if not self.junkfilter.check(urlObj):
                extrainfo("Junk Filter - filtered", url)
                return True
                                                                                                                 
        # now apply REP
        if self.__apply_rep(urlObj):
            extrainfo("Robots.txt rules prevents download of ", url)
            return True

        # check if this is an external link
        if self.__is_external_link( urlObj ):
            extrainfo("External link - filtered ", urlObj.get_full_url())
            return True

        # depth check
        if self.__apply_depth_check(urlObj):
            extrainfo("Depth exceeds - filtered ", urlObj.get_full_url())
            return True

        return False


Generated by  Doxygen 1.6.0   Back to index