
# ====== 删除Halo博客中重复的水木合集，只保留最新创建的文章，文章发布时间仍与帖子最后回复时间保持一致 =======

import os
from collections import defaultdict

import requests

# ================= Configuration =================
# Base URL of the Halo site. Can be overridden with the HALO_URL environment
# variable; a trailing slash is stripped because API paths are appended to it
# starting with "/".
HALO_URL = os.environ.get("HALO_URL", "https://blog.sortie.com").rstrip("/")
# Personal access token, sent as a Bearer token on every API request.
# NOTE(review): a token is hard-coded here as the fallback value. Prefer
# supplying it through the HALO_PAT environment variable, and rotate this
# token if the file has ever been shared or committed.
HALO_PAT = os.environ.get("HALO_PAT", "pat_eyJrAChj-z7gYaVyAA5jO2o5v8G8")

# Only posts whose title contains this keyword are considered for clean-up.
DEBUG_KEYWORD = "合集转寄"
# =================================================

def _creation_sort_key(timestamp):
    """Return a sort key that orders creation timestamps chronologically.

    Assumes the value is an RFC 3339 UTC string such as
    ``2024-01-01T12:00:00.5Z`` whose fractional-second part may be absent or
    have a varying number of digits (TODO confirm against the Halo API).
    Comparing such strings directly mis-orders them, because ``"."`` sorts
    before ``"Z"``; here the fraction is split off and zero-padded so that
    lexicographic order matches chronological order.
    """
    base, _, fraction = timestamp.rstrip("Z").partition(".")
    return (base, fraction.ljust(9, "0"))


def _fetch_all_posts(session, timeout):
    """Download every post from the Halo posts endpoint, page by page.

    Returns the list of raw post dicts, or ``None`` after printing a message
    when a request fails, returns a non-200 status, or is not valid JSON.
    """
    posts = []
    page = 1

    while True:
        api_url = f"{HALO_URL}/apis/content.halo.run/v1alpha1/posts?page={page}&size=100"
        try:
            resp = session.get(api_url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"❌ 请求异常: {exc}")
            return None

        if resp.status_code != 200:
            print(f"❌ 请求失败 ({resp.status_code}): {resp.text}")
            return None

        try:
            data = resp.json()
        except ValueError as exc:
            print(f"❌ 响应解析失败: {exc}")
            return None

        items = data.get("items", [])
        if not items:
            return posts

        posts.extend(items)
        # Stop only when the server reports no further page AND the page was
        # short; otherwise keep going until an empty page is returned.
        if not data.get("hasNext", False) and len(items) < 100:
            return posts
        page += 1


def _group_by_title(posts, keyword):
    """Group the posts whose title contains *keyword* by their exact title.

    Each group entry keeps only the fields needed later: the resource name
    (used to delete), the creation timestamp (used to rank) and the slug
    (used for display). Missing or null fields become empty strings.
    """
    title_map = defaultdict(list)

    for post in posts:
        spec = post.get("spec") or {}
        title = spec.get("title") or ""
        if keyword not in title:
            continue

        metadata = post.get("metadata") or {}
        title_map[title].append({
            "name": metadata.get("name") or "",
            "creation_time": metadata.get("creationTimestamp") or "",
            "slug": spec.get("slug") or "",
        })

    return title_map


def clean_duplicate_posts(*, dry_run=False, timeout=30):
    """Delete duplicate posts, keeping only the most recently created one.

    Fetches all posts from the Halo API, groups those whose title contains
    ``DEBUG_KEYWORD`` by exact title, and for every title with more than one
    post issues a DELETE for all but the newest (by creation timestamp).
    Progress and a final summary are printed to stdout.

    Args:
        dry_run: When true, report what would be deleted without sending any
            DELETE request.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the script forever.

    Returns:
        None. Fetch failures are reported on stdout and abort the run;
        individual delete failures are reported and the run continues.
    """
    # The context manager closes the session on every exit path, including
    # the early return taken when fetching fails.
    with requests.Session() as api_session:
        api_session.headers.update({"Authorization": f"Bearer {HALO_PAT}"})

        print(">> 正在拉取博客底层文章数据...\n")
        posts = _fetch_all_posts(api_session, timeout)
        if posts is None:
            return

        print(f">> ✅ 数据拉取完毕！底层接口共返回了 {len(posts)} 篇文章。\n")

        title_map = _group_by_title(posts, DEBUG_KEYWORD)

        duplicate_count = 0
        deleted_count = 0
        print("-" * 75)

        for title, items in title_map.items():
            if len(items) < 2:
                continue

            duplicate_count += 1
            print(f"📦 处理重复标题: {repr(title)} (共 {len(items)} 篇)")

            # Newest first: the head of the list is kept, the rest are removed.
            newest, *stale = sorted(
                items,
                key=lambda entry: _creation_sort_key(entry["creation_time"]),
                reverse=True,
            )
            print(f"   ✅ [保留] 创建时间: {newest['creation_time']} | 链接: /archives/{newest['slug']}")

            for item in stale:
                post_name = item["name"]
                print(f"   🗑️ [正在删除] 创建时间: {item['creation_time']} | 资源: {post_name} ...", end=" ")

                # An empty name would turn the URL into the collection
                # endpoint itself, so never send a DELETE without one.
                if not post_name:
                    print("失败 (缺少资源名称，已跳过)")
                    continue

                if dry_run:
                    print("跳过 (dry-run)")
                    continue

                delete_url = f"{HALO_URL}/apis/content.halo.run/v1alpha1/posts/{post_name}"
                try:
                    del_resp = api_session.delete(delete_url, timeout=timeout)
                except requests.RequestException as exc:
                    # One failed delete should not abort the remaining ones.
                    print(f"失败 ({exc})")
                    continue

                if del_resp.status_code in (200, 202, 204):
                    print("成功")
                    deleted_count += 1
                else:
                    print(f"失败 ({del_resp.status_code}: {del_resp.text})")
            print("-" * 75)

        if duplicate_count > 0:
            print(f"⚠️ 清理完毕：共处理 {duplicate_count} 组重复文章，成功删除 {deleted_count} 篇旧文章。")
        else:
            print("🎉 检查完毕：没有发现需要清理的重复文章。")

# Run the clean-up only when this file is executed as a script, so that
# importing the module does not trigger any API calls or deletions.
if __name__ == "__main__":
    clean_duplicate_posts()
