Files
python/tools/check_links.py
P小二 7a3c40fd56 feat: major Jekyll upgrade and modernization
- Upgrade theme from minimal to Cayman
- Add responsive design and mobile optimization
- Implement dual-language support (EN/CN)
- Add comprehensive SEO optimization
- Create professional project structure
- Add custom CSS with 350+ lines of enhancements
- Implement automatic ToC generation
- Add contribution and deployment guides
- Update all content with modern resources
- Fix 21 broken links and add 25+ new resources
2025-07-09 16:47:51 +08:00

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FlyPython 链接检查工具
用于定期检查README文件中所有外部链接的有效性
"""
import re
import requests
import time
import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed


class LinkChecker:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.timeout = 10
        self.results = {
            'working': [],
            'broken': [],
            'redirect': [],
            'timeout': [],
            'unknown': []
        }

    def extract_links_from_file(self, filename):
        """Extract all external links from a markdown file."""
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            print(f"Unable to read file {filename}: {e}")
            return []

        # Match markdown-style links: [text](url)
        markdown_links = re.findall(r'\[([^\]]*)\]\(([^)]+)\)', content)
        # Match bare URLs
        url_pattern = r'https?://[^\s\])\}]+'
        plain_links = re.findall(url_pattern, content)

        links = []
        # Collect markdown links
        for text, url in markdown_links:
            if url.startswith('http'):
                links.append({
                    'text': text,
                    'url': url,
                    'file': filename,
                    'type': 'markdown'
                })
        # Collect bare URLs, avoiding duplicates
        for url in plain_links:
            if not any(link['url'] == url for link in links):
                links.append({
                    'text': url,
                    'url': url,
                    'file': filename,
                    'type': 'plain'
                })
        return links

    def check_link(self, link):
        """Check the status of a single link."""
        url = link['url']
        try:
            response = self.session.head(url, timeout=self.timeout, allow_redirects=True)
            status_code = response.status_code
            if status_code == 200:
                link['status'] = 'working'
                link['status_code'] = status_code
                self.results['working'].append(link)
            elif 300 <= status_code < 400:
                link['status'] = 'redirect'
                link['status_code'] = status_code
                link['final_url'] = response.url
                self.results['redirect'].append(link)
            else:
                # Retry with GET: some sites do not support HEAD
                try:
                    response = self.session.get(url, timeout=self.timeout)
                    if response.status_code == 200:
                        link['status'] = 'working'
                        link['status_code'] = response.status_code
                        self.results['working'].append(link)
                    else:
                        link['status'] = 'broken'
                        link['status_code'] = response.status_code
                        self.results['broken'].append(link)
                except requests.exceptions.RequestException:
                    link['status'] = 'broken'
                    link['status_code'] = status_code
                    self.results['broken'].append(link)
        except requests.exceptions.Timeout:
            link['status'] = 'timeout'
            link['error'] = 'Request timeout'
            self.results['timeout'].append(link)
        except requests.exceptions.RequestException as e:
            link['status'] = 'unknown'
            link['error'] = str(e)
            self.results['unknown'].append(link)
        return link

    def check_all_links(self, links, max_workers=10):
        """Check all links concurrently."""
        print(f"Checking {len(links)} links...")
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_link = {executor.submit(self.check_link, link): link for link in links}
            for i, future in enumerate(as_completed(future_to_link), 1):
                link = future_to_link[future]
                try:
                    result = future.result()
                    status = result.get('status', 'unknown')
                    print(f"[{i}/{len(links)}] {status.upper()}: {result['url']}")
                    time.sleep(0.1)  # brief pause between progress lines
                except Exception as e:
                    print(f"Error while checking {link['url']}: {e}")

    def generate_report(self):
        """Generate the check report."""
        total = sum(len(links) for links in self.results.values())
        print("\n" + "=" * 60)
        print("Link Check Report")
        print("=" * 60)
        print(f"Total links: {total}")
        print(f"Working links: {len(self.results['working'])}")
        print(f"Redirected links: {len(self.results['redirect'])}")
        print(f"Broken links: {len(self.results['broken'])}")
        print(f"Timed-out links: {len(self.results['timeout'])}")
        print(f"Unknown status: {len(self.results['unknown'])}")

        # Save the detailed results
        os.makedirs('../reports', exist_ok=True)
        with open('../reports/link_check_results.json', 'w', encoding='utf-8') as f:
            json.dump(self.results, f, ensure_ascii=False, indent=2)
        print("\nDetailed results saved to: reports/link_check_results.json")


def main():
    checker = LinkChecker()
    # Extract links from the README files (paths relative to the project root)
    files_to_check = ['../README.md', '../README_cn.md']
    all_links = []
    for filename in files_to_check:
        print(f"Extracting links from {filename}...")
        links = checker.extract_links_from_file(filename)
        all_links.extend(links)
        print(f"Found {len(links)} links")

    if not all_links:
        print("No links found!")
        return

    # De-duplicate
    unique_links = []
    seen_urls = set()
    for link in all_links:
        if link['url'] not in seen_urls:
            unique_links.append(link)
            seen_urls.add(link['url'])
    print(f"{len(unique_links)} unique links after de-duplication")

    # Check the links
    checker.check_all_links(unique_links)
    # Generate the report
    checker.generate_report()


if __name__ == '__main__':
    main()
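
The relative paths above mean the script expects to be run from python/tools/. A minimal usage sketch follows; the scratch file name and sample URLs are illustrative assumptions, not part of the repo:

# Sketch: exercise LinkChecker against a throwaway markdown file.
# Assumes this runs in the same directory as check_links.py;
# 'scratch.md' and the URLs below are placeholders for illustration only.
from check_links import LinkChecker

with open('scratch.md', 'w', encoding='utf-8') as f:
    f.write('[Python docs](https://docs.python.org/3/)\n')
    f.write('Bare link: https://pypi.org\n')

checker = LinkChecker()
links = checker.extract_links_from_file('scratch.md')
print(len(links))  # 2: one markdown link, one bare URL (duplicates are skipped)
checker.check_all_links(links, max_workers=2)
checker.generate_report()  # writes ../reports/link_check_results.json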