Optimize the link normalization function in the link filter

The link filter has been enhanced to strip the unnecessary protocol prefix and leading slashes from links. This update allows links to be handled in a cleaner form while preserving their essential identity.
wood 2024-09-05 13:28:28 +08:00
parent a1b9371afb
commit 39fe41eb83


@@ -54,13 +54,20 @@ class LinkFilter:
logger.info(f"Reloaded {len(self.keywords)} keywords and {len(self.whitelist)} whitelist entries")
def normalize_link(self, link):
# 移除协议部分(如 http:// 或 https://
link = re.sub(r'^https?://', '', link)
# 移除开头的双斜杠
link = link.lstrip('/')
parsed = urllib.parse.urlparse(f"http://{link}")
normalized = urllib.parse.urlunparse(('', parsed.netloc, parsed.path, parsed.params, parsed.query, ''))
result = normalized.rstrip('/')
logger.debug(f"Normalized link: {link} -> {result}")
return result
def is_whitelisted(self, link):
extracted = tldextract.extract(link)
domain = f"{extracted.domain}.{extracted.suffix}"