From 39fe41eb8382dfec4b7ebf4d76887c94957d35e3 Mon Sep 17 00:00:00 2001
From: wood
Date: Thu, 5 Sep 2024 13:28:28 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E9=93=BE=E6=8E=A5=E8=BF=87?=
 =?UTF-8?q?=E6=BB=A4=E5=99=A8=E4=B8=AD=E7=9A=84=E9=93=BE=E6=8E=A5=E6=A0=87?=
 =?UTF-8?q?=E5=87=86=E5=8C=96=E5=87=BD=E6=95=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

链接过滤器功能已增强,可消除链接中不必要的协议部分和初始斜线。该更新有助于更清晰地处理链接,同时保持其基本标识不变。
---
 src/link_filter.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/link_filter.py b/src/link_filter.py
index bf0ac55..56f09df 100644
--- a/src/link_filter.py
+++ b/src/link_filter.py
@@ -54,13 +54,20 @@ class LinkFilter:
         logger.info(f"Reloaded {len(self.keywords)} keywords and {len(self.whitelist)} whitelist entries")
 
     def normalize_link(self, link):
+        # 移除协议部分(如 http:// 或 https://)
         link = re.sub(r'^https?://', '', link)
+
+        # 移除开头的双斜杠
+        link = link.lstrip('/')
+
         parsed = urllib.parse.urlparse(f"http://{link}")
         normalized = urllib.parse.urlunparse(('', parsed.netloc, parsed.path, parsed.params, parsed.query, ''))
         result = normalized.rstrip('/')
+
         logger.debug(f"Normalized link: {link} -> {result}")
         return result
 
+
     def is_whitelisted(self, link):
         extracted = tldextract.extract(link)
         domain = f"{extracted.domain}.{extracted.suffix}"