
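# ===== Merge and de-duplicate groups of SMTH (水木) digest emails that share the same subject in a QQ mailbox, order the posts by post time, and publish the result to a Halo blog =====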

import imaplib
import email
from email.header import decode_header
import email.utils
import requests
import json
import re
import time
import datetime
import html
from collections import defaultdict

# ================= 1. Basic configuration =================
# NOTE(review): the mailbox credentials and the Halo token are hard-coded
# here; consider loading them from the environment or a secrets store.
QQ_EMAIL = "8785@qq.com"
QQ_AUTH_CODE = "nnfiemdejf"  # QQ Mail authorization code: Settings - Account & Security - Security Settings - Generate authorization code
IMAP_SERVER = "imap.qq.com"

HALO_URL = "https://blog.sortie.com"
HALO_PAT = "pat_eyJraWQiOiJr0ku5jO2o5v8G8"

# ================= 2. Rule configuration =================
TARGET_KEYWORD = "合集转寄"
MAX_TARGET_EMAILS = 50           # Max amount handled per run (counted in distinct subject groups)
MARK_AS_READ = True              # Flag the emails as read after a successful publish

# [Switch] Only process subjects that occur in more than one email?
# When True, a subject that appears in a single email is skipped; when False,
# every email containing the keyword is processed, de-duplicated and published.
ONLY_PROCESS_DUPLICATES = True

# [Time rule] Do not publish threads posted within the last N days (0 disables the limit)
IGNORE_RECENT_DAYS = 4

# ================= 3. Style and taxonomy configuration =================
MAIN_TEXT_SIZE = "17px"
QUOTE_TEXT_SIZE = "15px"
POST_CATEGORY = "水木论坛"
POST_TAG = "水木论坛"
# ===============================================

# --- Precompiled regular expressions ---
# An optional ESC byte, then "[", digits/semicolons and a final "m" or "K"
# (the shape of ANSI color / erase sequences).
COLOR_REGEX = re.compile(r'(?:\x1B)?\[[0-9;]*[mK]')
# A "☆", at least five box-drawing dashes / em dashes / hyphens / whitespace
# characters, and a closing "☆"; used to split a body into posts.
DELIMITER_REGEX = re.compile(r'(?:\u2606|☆)[\u2500\u2014\-\s]{5,}(?:\u2606|☆)')
# Opening or closing BBCode-style tags from a fixed list, with an optional "=value".
BBCODE_REGEX = re.compile(r'\[/?(?:b|i|u|s|color|size|font|align|url|img|email|quote|code)(?:=[^\]]*)?\]', re.IGNORECASE)

def safe_decode(payload, charset):
    if not payload: return ""
    charset = charset.lower() if charset else 'utf-8'
    if 'gb' in charset or 'unknown' in charset:
        charset = 'gb18030'
    try:
        return payload.decode(charset)
    except UnicodeDecodeError:
        alt_charset = 'utf-8' if charset == 'gb18030' else 'gb18030'
        try:
            return payload.decode(alt_charset)
        except UnicodeDecodeError:
            return payload.decode(charset, 'ignore')
    except LookupError:
        return payload.decode('utf-8', 'ignore')

def decode_str(s):
    """Decode an RFC 2047 style mail header value into one text string.

    ``decode_header`` may return several ``(value, charset)`` fragments, for
    example when a header mixes plain text with an encoded-word
    (``Re: =?gbk?B?...?=``) or uses more than one charset.  Every fragment is
    decoded and the results are concatenated, so the header is returned in
    full instead of being cut off after the first fragment.

    Args:
        s: Header value as returned by the ``email`` package, or ``None``.

    Returns:
        The decoded header text; ``""`` for a falsy input.  If decoding
        fails for any reason, ``str(s)`` is returned as a best effort.
    """
    if not s:
        return ""
    try:
        pieces = []
        for value, charset in decode_header(s):
            if isinstance(value, bytes):
                pieces.append(safe_decode(value, charset))
            else:
                # Headers without any encoded-word come back as plain str.
                pieces.append(value)
        return "".join(pieces)
    except Exception:
        # Best effort: a malformed header must not abort the whole run.
        return str(s)

def extract_email_body(msg):
    """Return the plain-text body of an email message.

    For a multipart message, the first ``text/plain`` part whose
    Content-Disposition does not mention "attachment" is decoded and
    returned; if there is no such part the result is ``""``.  A
    non-multipart message has its own payload decoded.

    Args:
        msg: A message object from the ``email`` package.

    Returns:
        The decoded body text.
    """
    if not msg.is_multipart():
        return safe_decode(msg.get_payload(decode=True), msg.get_content_charset())

    for part in msg.walk():
        if part.get_content_type() != "text/plain":
            continue
        disposition = str(part.get("Content-Disposition"))
        if "attachment" in disposition:
            continue
        return safe_decode(part.get_payload(decode=True), part.get_content_charset())

    return ""

def parse_sm_content(raw_text):
    """Split the plain-text body of a digest email into a header and posts.

    The text is first cleaned: control characters, U+FFFD marks, color
    escape sequences and BBCode-style tags are removed, and blank lines,
    lines without any letter/digit/CJK character, and lines where no CJK
    character precedes a "发自"/"来自" marker are dropped.  The remainder is
    split on ``DELIMITER_REGEX``; each piece after the first becomes one
    "floor" (post), which is further split into author metadata, main text,
    quoted text and a discarded trailing signature.

    Args:
        raw_text: Decoded email body.

    Returns:
        A ``(header_text, floors)`` tuple.  ``header_text`` is the text in
        front of the first delimiter (``""`` when no delimiter is present).
        ``floors`` is a list of dicts with the keys ``author``, ``nick``,
        ``time``, ``main`` and ``quote``.  Blank input yields ``("", [])``.
    """
    if not raw_text.strip():
        return "", []

    # Drop control characters (tab, LF and CR are kept) and replacement marks.
    clean = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', raw_text)
    clean = clean.replace('\ufffd', '')

    clean = COLOR_REGEX.sub('', clean)
    clean = BBCODE_REGEX.sub('', clean)

    filtered_lines = []
    for line in clean.split('\n'):
        # Delimiter lines carry no letters, so keep them before the filters run.
        if DELIMITER_REGEX.search(line):
            filtered_lines.append(line)
            continue
        trimmed = line.strip()
        if not trimmed:
            continue

        # Position of the earliest "发自"/"来自" marker, or -1 if neither occurs.
        f_idx, l_idx = trimmed.find('发自'), trimmed.find('来自')
        target_idx = min(f_idx, l_idx) if f_idx != -1 and l_idx != -1 else max(f_idx, l_idx)
        if target_idx != -1 and not re.search(r'[\u4e00-\u9fa5]', trimmed[:target_idx]):
            continue
        if not re.search(r'[A-Za-z0-9\u4e00-\u9fa5]', trimmed):
            continue
        filtered_lines.append(line)

    clean_text = '\n'.join(filtered_lines)
    blocks = DELIMITER_REGEX.split(clean_text)

    if len(blocks) == 1:
        # No delimiter at all: treat the whole text as one anonymous post.
        return '', [{'author': '未知', 'nick': '', 'time': '', 'main': clean_text, 'quote': ''}]

    header_text = blocks[0].strip()
    floors = []
    for raw_block in blocks[1:]:
        block = raw_block.strip()
        if not block:
            continue

        author, nick, time_str = '未知', '', ''
        meta_match = re.search(r'^([\s\S]*?)的大作中提到[:：](?:[\r\n]+([\s\S]*))?$', block)

        if meta_match:
            head = meta_match.group(1)
            content = meta_match.group(2) or ""
            u_match = re.search(r'^\s*(.*?)\s*\((.*?)\)', head)
            if u_match:
                author, nick = u_match.group(1).strip(), u_match.group(2).strip()
            t_match = re.search(r'于\s*\(?(.*?)\)?\s*在', head)
            if t_match:
                time_str = t_match.group(1).strip()
        else:
            content = block

        # Cut the trailing signature first ...
        s_match = re.search(r'(--\s*FROM|-+\s*[发来]自|[发来]自「.*?」)', content, re.IGNORECASE)
        text_without_sig = content[:s_match.start()].strip() if s_match else content

        # ... then locate the quote header in the very string that is sliced
        # below.  Searching the unstripped `content` instead would shift the
        # index by the amount of leading whitespace removed by strip().
        q_match = re.search(r'【\s*在\s+[\s\S]*?的大作中提到[:：]\s*】', text_without_sig)
        q_idx = q_match.start() if q_match else -1

        main_text, quote_text = '', ''
        if q_idx == -1:
            main_text = text_without_sig.strip()
        elif text_without_sig[:q_idx].strip() != '':
            # Reply text comes first, the quote follows it.
            main_text = text_without_sig[:q_idx].strip()
            quote_text = text_without_sig[q_idx:].strip()
        else:
            # Quote comes first: the quote header (up to a line ending in
            # "】") and the following ":"-prefixed or blank lines belong to
            # the quote; everything from the first other line on is the reply.
            quote_lines, main_lines = [], []
            in_quote, header_passed = True, False
            for t_line in text_without_sig.split('\n'):
                stripped = t_line.strip()
                if not in_quote:
                    main_lines.append(t_line)
                elif not header_passed:
                    quote_lines.append(t_line)
                    if stripped.endswith('】'):
                        header_passed = True
                elif stripped == '' or stripped.startswith((':', '：')):
                    quote_lines.append(t_line)
                else:
                    in_quote = False
                    main_lines.append(t_line)
            while quote_lines and quote_lines[-1].strip() == '':
                quote_lines.pop()
            quote_text = '\n'.join(quote_lines).strip()
            main_text = '\n'.join(main_lines).strip()

        # Collapse runs of three or more newlines into a single blank line.
        main_text = re.sub(r'\n{3,}', '\n\n', main_text)
        floors.append({'author': author, 'nick': nick, 'time': time_str, 'main': main_text, 'quote': quote_text})

    return header_text, floors

def render_sm_html(header_text, floors):
    """Render a thread header and its posts as a self-contained HTML fragment.

    The fragment is a single ``<div>`` with a millisecond-timestamp based id,
    an embedded ``<style>`` block scoped to that id, a button that toggles
    the ``light-mode`` class, an optional header box and one card per post.
    All post-derived text is HTML-escaped.

    Args:
        header_text: Text shown above the posts; skipped when empty.
        floors: Iterable of dicts with the keys ``author``, ``nick``,
            ``time``, ``main`` and ``quote``.

    Returns:
        The HTML fragment as a string.
    """
    unique_id = f"sm_{int(time.time()*1000)}"
    system_font = '\"Inter\", -apple-system, BlinkMacSystemFont, \"Segoe UI\", Roboto, \"Helvetica Neue\", Arial, sans-serif'
    mono_font = '\"Fira Code\", \"SFMono-Regular\", Consolas, \"Liberation Mono\", Menlo, Courier, monospace'

    # Collect the pieces in order and join them once at the end.
    chunks = [f"""
    <div id="{unique_id}" class="sm-export-container dark-mode">
        <style>
            #{unique_id}, #{unique_id} * {{ -webkit-font-smoothing: antialiased !important; -moz-osx-font-smoothing: grayscale !important; text-shadow: none !important; -webkit-text-stroke: 0 !important; }}
            #{unique_id} {{ --bg: #121212; --card: #262626; --text: #e2e8f0; --sec: #94a3b8; --ter: #64748b; --border: #333333; --quote: rgba(0,0,0,0.2); --q-border: #3b82f6; --h-bg: rgba(0,0,0,0.3); font-family: {system_font} !important; padding: 20px 6px !important; transition: all 0.3s ease !important; background: var(--bg) !important; color: var(--text) !important; box-sizing: border-box !important; width: 100% !important; border-radius: 8px !important; }}
            #{unique_id}.light-mode {{ --bg: #f3f4f6; --card: #ffffff; --text: #111827; --sec: #4b5563; --ter: #9ca3af; --border: #e5e7eb; --quote: #f3f4f6; --q-border: #a3b8cc; --h-bg: #f9fafb; }}
            #{unique_id} .export-btn {{ margin-bottom: 16px !important; padding: 6px 14px !important; cursor: pointer !important; border-radius: 20px !important; border: 1px solid var(--border) !important; background: var(--card) !important; color: var(--text) !important; font-size: 13px !important; font-weight: 500 !important; transition: all 0.2s !important;}}
            #{unique_id} .export-btn:hover {{ border-color: var(--ter) !important; }}
            #{unique_id} .thread-header {{ font-family: {mono_font} !important; font-size: 12px !important; color: var(--ter) !important; margin-bottom: 16px !important; padding: 10px 14px !important; background: var(--h-bg) !important; border-radius: 8px !important; border: 1px solid var(--border) !important; line-height: 1.6 !important; }}
            #{unique_id} .floor-card {{ background: var(--card) !important; border: 1px solid var(--border) !important; border-radius: 8px !important; padding: 10px 4px !important; margin-bottom: 16px !important; text-align: left !important; }}
            #{unique_id} .floor-meta {{ display: flex !important; justify-content: space-between !important; margin-bottom: 8px !important; padding-bottom: 8px !important; border-bottom: 1px solid var(--border) !important; }}
            #{unique_id} .floor-meta-left strong {{ color: var(--text) !important; font-size: 15px !important; font-family: {system_font} !important; font-weight: 600 !important; }}
            #{unique_id} .floor-meta-left span {{ color: var(--sec) !important; font-size: 13px !important; margin-left: 8px !important; font-family: {system_font} !important; font-weight: normal !important; }}
            #{unique_id} .floor-meta-right {{ color: var(--ter) !important; font-family: {mono_font} !important; font-size: 12px !important; font-weight: normal !important; }}
            #{unique_id} .floor-body {{ font-size: {MAIN_TEXT_SIZE} !important; color: var(--text) !important; font-family: {system_font} !important; font-weight: normal !important; line-height: 1.8 !important; white-space: pre-wrap !important; word-break: break-word !important; letter-spacing: normal !important; margin: 0 !important; padding: 0 !important; }}
            #{unique_id} .floor-quote {{ margin-top: 10px !important; padding: 10px 14px !important; background: var(--quote) !important; border-left: 4px solid var(--q-border) !important; border-radius: 4px !important; font-family: {system_font} !important; font-size: {QUOTE_TEXT_SIZE} !important; color: var(--sec) !important; white-space: pre-wrap !important; line-height: 1.7 !important; font-weight: normal !important; }}
        </style>
        <button class="export-btn" onclick="document.getElementById('{unique_id}').classList.toggle('light-mode')">🌓 切换明暗主题</button>
    """]

    if header_text:
        # The header box does not use pre-wrap, so line feeds become <br>.
        header_markup = html.escape(header_text).replace(chr(10), "<br>")
        chunks.append(f'    <div class="thread-header">{header_markup}</div>\n')

    for floor_no, floor in enumerate(floors, start=1):
        chunks.append(f"""    <div class="floor-card">
        <div class="floor-meta">
            <div class="floor-meta-left"><strong>{html.escape(floor['author'])}</strong><span>{html.escape(floor['nick'])}</span></div>
            <div class="floor-meta-right">{html.escape(floor['time'])} &middot; #{floor_no}</div>
        </div>
        <div class="floor-body">{html.escape(floor['main'])}</div>""")
        if floor['quote']:
            chunks.append(f'\n        <div class="floor-quote">{html.escape(floor["quote"])}</div>')
        chunks.append("\n    </div>\n")

    chunks.append("</div>")
    return "".join(chunks)

def get_halo_metadata_name(session, resource_type, display_name, timeout=15):
    """Look up the ``metadata.name`` of a Halo resource by its display name.

    Requests the first 100 items of ``resource_type`` from the Halo content
    API and returns the name of the first one whose ``spec.displayName``
    equals ``display_name``.  The lookup is best effort: any failure is
    reported on stdout and results in ``None`` rather than an exception.

    Args:
        session: A ``requests.Session`` carrying the authorization header.
        resource_type: Plural resource name used in the URL, e.g.
            ``"categories"`` or ``"tags"``.
        display_name: Display name to search for.
        timeout: Seconds to wait for the HTTP request; without a timeout a
            stalled connection would block the script indefinitely.

    Returns:
        The matching ``metadata.name``, or ``None`` if nothing matched or
        the request failed.
    """
    # NOTE: only one page of 100 items is requested; later pages are not read.
    api_url = f"{HALO_URL}/apis/content.halo.run/v1alpha1/{resource_type}?size=100"
    try:
        resp = session.get(api_url, timeout=timeout)
        if resp.status_code != 200:
            print(f"⚠️ 获取 Halo {resource_type} 列表失败 (HTTP {resp.status_code})")
            return None
        for item in resp.json().get("items") or []:
            if (item.get("spec") or {}).get("displayName") == display_name:
                return (item.get("metadata") or {}).get("name")
    except Exception as exc:
        # Deliberately broad: a missing category/tag id must not stop the
        # run, but the reason is reported instead of being silently dropped.
        print(f"⚠️ 获取 Halo {resource_type} 列表失败: {exc}")
    return None

def get_iso_time(sm_time_str, email_date_str):
    """Return an ISO 8601 timestamp for a post, always expressed in UTC+8.

    The post's own time string is tried first, in the formats
    ``YYYY-MM-DD HH:MM:SS`` and ``Www Mmm D HH:MM:SS YYYY``; it carries no
    zone and is interpreted as UTC+8.  If it cannot be parsed, the email's
    ``Date`` header is used (treated as UTC when it has no zone), and
    finally the current time.  Every result is converted to UTC+8 so that
    all returned strings share one offset and therefore order the same way
    as strings and as instants.

    Args:
        sm_time_str: Time string taken from the post header, may be empty.
        email_date_str: Value of the email ``Date`` header, may be empty.

    Returns:
        A string such as ``"2024-01-02T03:04:05+08:00"``.
    """
    tz_cn = datetime.timezone(datetime.timedelta(hours=8))
    month_numbers = {"Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5, "Jun": 6,
                     "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12}

    dt = None
    if sm_time_str:
        sm_time_str = sm_time_str.strip()
        try:
            dt = datetime.datetime.strptime(sm_time_str, "%Y-%m-%d %H:%M:%S")
        except ValueError:
            dt = None

        if dt is None:
            m = re.search(r'[A-Za-z]{3}\s+([A-Za-z]{3})\s+(\d+)\s+(\d+):(\d+):(\d+)\s+(\d+)', sm_time_str)
            if m:
                mon_str, day, hour, minute, second, year = m.groups()
                # An unrecognised month is a parse failure, not January.
                month = month_numbers.get(mon_str.title())
                if month is not None:
                    try:
                        dt = datetime.datetime(int(year), month, int(day),
                                               int(hour), int(minute), int(second))
                    except (ValueError, OverflowError):
                        dt = None

    if dt is not None:
        return dt.replace(tzinfo=tz_cn).isoformat(timespec='seconds')

    if email_date_str:
        try:
            # str() guards against a non-str header object being passed in.
            dt = email.utils.parsedate_to_datetime(str(email_date_str))
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=datetime.timezone.utc)
            return dt.astimezone(tz_cn).isoformat(timespec='seconds')
        except (TypeError, ValueError, OverflowError):
            pass

    return datetime.datetime.now(tz_cn).isoformat(timespec='seconds')

def post_to_halo(session, eid, title, html_content, category_id, tag_id, publish_time_iso, timeout=30):
    """Create and publish one post on the Halo blog.

    Args:
        session: A ``requests.Session`` carrying the authorization header.
        eid: Email id used as part of the post slug.
        title: Post title; CR/LF/tab runs are replaced by a single space.
        html_content: HTML used both as raw and rendered content.
        category_id: Halo category name to attach, or a falsy value for none.
        tag_id: Halo tag name to attach, or a falsy value for none.
        publish_time_iso: ISO 8601 publish time sent as ``publishTime``.
        timeout: Seconds to wait for the HTTP request; without a timeout a
            stalled connection would block the script indefinitely.

    Returns:
        ``True`` when the API answers 200 or 201, ``False`` on any other
        status or when the request itself fails.
    """
    api_url = f"{HALO_URL}/apis/api.console.halo.run/v1alpha1/posts"
    # The millisecond timestamp keeps the slug unique across repeated runs.
    unique_slug = f"sm-{eid}-{int(time.time() * 1000)}"
    clean_title = re.sub(r'[\r\n\t]+', ' ', title).strip()

    payload = {
        "post": {
            "apiVersion": "content.halo.run/v1alpha1",
            "kind": "Post",
            "metadata": {"generateName": "sm-post-"},
            "spec": {
                "title": clean_title,
                "slug": unique_slug,
                "publish": True,
                "publishTime": publish_time_iso,
                "allowComment": True,
                "visible": "PUBLIC",
                "pinned": False,
                "priority": 0,
                "deleted": False,
                "excerpt": {"autoGenerate": True, "raw": ""}
            }
        },
        "content": {
            "raw": html_content,
            "content": html_content,
            "rawType": "html"
        }
    }

    if category_id:
        payload["post"]["spec"]["categories"] = [category_id]
    if tag_id:
        payload["post"]["spec"]["tags"] = [tag_id]

    try:
        response = session.post(api_url, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        # A transport failure fails this post only; the caller decides
        # whether to carry on with the remaining groups.
        print(f"❌ 发布失败 (网络错误): {exc}")
        return False

    if response.status_code in [200, 201]:
        print(f"✅ 成功发布: {clean_title} [最新时间: {publish_time_iso}]")
        return True

    print(f"❌ 发布失败 ({response.status_code}): {response.text}")
    return False

def fetch_and_process():
    """Run the whole pipeline: fetch, group, merge, render and publish.

    Steps:
      1. Resolve the Halo category and tag names.
      2. Log in to the IMAP server and search INBOX for subjects containing
         ``TARGET_KEYWORD``.
      3. Fetch only the Subject/Date headers and group the email ids by
         decoded subject.
      4. For each group (at most ``MAX_TARGET_EMAILS`` publish attempts),
         parse every email, de-duplicate the posts on
         ``(author, time, main text)``, order them chronologically, render
         them and publish the result.  Groups whose latest post is younger
         than ``IGNORE_RECENT_DAYS`` days are skipped.  After a successful
         publish the emails of the group are flagged as read when
         ``MARK_AS_READ`` is set.

    Errors are reported on stdout; the IMAP connection and the HTTP session
    are always closed.  Returns ``None``.
    """
    api_session = requests.Session()
    api_session.headers.update({
        "Authorization": f"Bearer {HALO_PAT}"
    })

    mail = None
    try:
        print(">> 步骤1: 获取 Halo 博客基建数据...")
        category_id = get_halo_metadata_name(api_session, "categories", POST_CATEGORY)
        tag_id = get_halo_metadata_name(api_session, "tags", POST_TAG)

        print(">> 步骤2: 连接 QQ 邮箱服务器...")
        mail = imaplib.IMAP4_SSL(IMAP_SERVER, 993)
        mail.login(QQ_EMAIL, QQ_AUTH_CODE)
        mail.select("INBOX")

        print(">> 正在向服务器发送关键字查询指令...")
        # ALL matches read and unread messages alike.
        # NOTE(review): nothing below filters on the \Seen flag, so a group
        # published by an earlier run looks eligible again — confirm that
        # this is intended.
        status, messages = mail.search('UTF-8', 'ALL', 'SUBJECT', TARGET_KEYWORD.encode('utf-8'))

        if status != "OK" or not messages[0]:
            print("📭 没有找到包含目标关键字的邮件。")
            return

        email_ids = messages[0].split()
        print(f"   (服务器过滤完成，共找到 {len(email_ids)} 封待检邮件)")

        # Reverse the id order so that the most recent emails come first.
        email_ids.reverse()

        print(">> 步骤3: 获取邮件头并按重复标题进行分组...")
        subject_groups = defaultdict(list)
        email_dates = {}

        for eid in email_ids:
            # PEEK fetches the headers without setting the \Seen flag.
            _, header_data = mail.fetch(eid, '(BODY.PEEK[HEADER.FIELDS (SUBJECT DATE)])')
            subject_str = ""
            email_date_str = ""
            for response_part in header_data:
                if isinstance(response_part, tuple):
                    msg_header = email.message_from_bytes(response_part[1])
                    subject_str = decode_str(msg_header["Subject"]).strip()
                    email_date_str = msg_header.get("Date", "")
                    break

            # Re-check the keyword locally on the decoded subject.
            if TARGET_KEYWORD in subject_str:
                subject_groups[subject_str].append(eid)
                email_dates[eid] = email_date_str

        print(f"   (分组完成，共 {len(subject_groups)} 个独立的主题聚合组)")

        processed_count = 0   # publish attempts, bounded by MAX_TARGET_EMAILS
        published_count = 0   # attempts that actually succeeded
        print(f">> 步骤4: 开始合并并处理邮件内容 (目标最多处理 {MAX_TARGET_EMAILS} 组)...")

        for subject, group_eids in subject_groups.items():
            if processed_count >= MAX_TARGET_EMAILS:
                print(f"🛑 已达到最大处理数量 ({MAX_TARGET_EMAILS})，本轮任务结束。")
                break

            # Optionally skip subjects that occur in a single email only.
            if ONLY_PROCESS_DUPLICATES and len(group_eids) < 2:
                continue

            print(f"📬 正在处理聚合组: {subject} (组内包含 {len(group_eids)} 封邮件)")

            all_floors = []
            seen_signatures = set()
            merged_header_text = ""

            for eid in group_eids:
                _, msg_data = mail.fetch(eid, '(RFC822)')
                msg = None
                for response_part in msg_data:
                    if isinstance(response_part, tuple):
                        msg = email.message_from_bytes(response_part[1])
                        break

                if not msg:
                    continue

                raw_body = extract_email_body(msg)
                header, floors = parse_sm_content(raw_body)

                # The first non-empty header found becomes the merged header.
                if header and not merged_header_text:
                    merged_header_text = header

                email_date_str = email_dates.get(eid, "")

                for f in floors:
                    author = f.get('author', '').strip()
                    time_str = f.get('time', '').strip()
                    main_content = f.get('main', '').strip()

                    # Author + post time + text identify a post across emails.
                    sig = (author, time_str, main_content)
                    if sig in seen_signatures:
                        continue
                    seen_signatures.add(sig)

                    f['sort_key'] = get_iso_time(time_str, email_date_str)
                    all_floors.append(f)

            if not all_floors:
                continue

            # Order by the instant each key denotes rather than by its text:
            # ISO strings with different UTC offsets do not sort
            # chronologically as plain strings.  The keys come from
            # get_iso_time, which always emits ISO 8601 with an offset.
            all_floors.sort(key=lambda floor: datetime.datetime.fromisoformat(floor['sort_key']))

            formatted_html = render_sm_html(merged_header_text, all_floors)
            if not formatted_html:
                continue

            # The newest post decides both the publish time and the age check.
            last_floor_time = all_floors[-1]['sort_key']

            if IGNORE_RECENT_DAYS > 0:
                dt_publish = datetime.datetime.fromisoformat(last_floor_time)
                now_dt = datetime.datetime.now(datetime.timezone.utc)
                age_days = (now_dt - dt_publish).days
                if age_days < IGNORE_RECENT_DAYS:
                    print(f"⏳ 暂不发布: {subject} (原帖最新回复于 {age_days} 天前，未满 {IGNORE_RECENT_DAYS} 天)")
                    continue

            processed_count += 1

            # The id of the group's first email becomes part of the slug.
            success = post_to_halo(
                session=api_session,
                eid=group_eids[0].decode('utf-8'),
                title=subject,
                html_content=formatted_html,
                category_id=category_id,
                tag_id=tag_id,
                publish_time_iso=last_floor_time
            )

            if success:
                published_count += 1
                # Flag every email that took part in the merge as read.
                if MARK_AS_READ:
                    for eid in group_eids:
                        mail.store(eid, '+FLAGS', '\\Seen')

        # Report successful publishes only, not mere attempts.
        print(f"🎉 运行完毕，本次共发布了 {published_count} 篇合集文章。")

    except Exception as e:
        # Top-level boundary of the script: report and fall through to cleanup.
        print(f"❌ 发生网络或系统错误: {e}")

    finally:
        if mail:
            try:
                mail.logout()
                print("🔌 邮箱连接已安全断开。")
            except (imaplib.IMAP4.error, OSError):
                # The connection is already unusable; nothing left to clean up.
                pass
        api_session.close()

# Run the fetch-merge-publish pipeline only when the file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    fetch_and_process()
