diff --git a/src/link_filter.py b/src/link_filter.py index bf0ac55..56f09df 100644 --- a/src/link_filter.py +++ b/src/link_filter.py @@ -54,13 +54,20 @@ class LinkFilter: logger.info(f"Reloaded {len(self.keywords)} keywords and {len(self.whitelist)} whitelist entries") def normalize_link(self, link): + # Remove the scheme prefix (e.g. http:// or https://) link = re.sub(r'^https?://', '', link) + + # Strip any leading slashes (e.g. the // of a protocol-relative link) + link = link.lstrip('/') + parsed = urllib.parse.urlparse(f"http://{link}") normalized = urllib.parse.urlunparse(('', parsed.netloc, parsed.path, parsed.params, parsed.query, '')) result = normalized.rstrip('/') + logger.debug(f"Normalized link: {link} -> {result}") return result + def is_whitelisted(self, link): extracted = tldextract.extract(link) domain = f"{extracted.domain}.{extracted.suffix}"