import sqlite3
import os
import re
import difflib

DB_NAME = os.path.join(os.path.dirname(__file__), "processed_posts.db")

def clean_url_for_comparison(url):
    """Strips protocol, www, parameters, and trailing slashes to accurately match duplicate URLs."""
    if not url:
        return ""
    url_clean = url.lower().replace("https://", "").replace("http://", "").replace("www.", "")
    url_clean = url_clean.split("?")[0].split("#")[0].rstrip("/")
    return url_clean

def clean_title_for_comparison(title):
    """Lowercases and strips all non-alphanumeric characters for title duplicate checks."""
    if not title:
        return ""
    return re.sub(r'[^a-zA-Z0-9]', '', title).lower()

def init_db():
    """Initializes the database and creates the processed_posts table if it doesn't exist."""
    conn = sqlite3.connect(DB_NAME)
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS processed_posts (
            url TEXT PRIMARY KEY,
            title TEXT,
            clean_url TEXT,
            clean_title TEXT,
            processed_at DATETIME DEFAULT CURRENT_TIMESTAMP
        )
    """)
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_clean_url ON processed_posts(clean_url)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_clean_title ON processed_posts(clean_title)")
    conn.commit()
    conn.close()

def is_url_processed(url, title=None):
    """Checks if a URL or title has already been processed and logged in the DB."""
    conn = sqlite3.connect(DB_NAME)
    cursor = conn.cursor()
    
    clean_url = clean_url_for_comparison(url)
    clean_title = clean_title_for_comparison(title)
    
    if clean_title:
        cursor.execute("SELECT 1 FROM processed_posts WHERE clean_url = ? OR clean_title = ?", (clean_url, clean_title))
    else:
        cursor.execute("SELECT 1 FROM processed_posts WHERE clean_url = ?", (clean_url,))
        
    result = cursor.fetchone()
    conn.close()
    return result is not None

def mark_url_as_processed(url, title):
    """Saves a URL and its clean versions to the database to mark it as processed."""
    conn = sqlite3.connect(DB_NAME)
    cursor = conn.cursor()
    
    clean_url = clean_url_for_comparison(url)
    clean_title = clean_title_for_comparison(title)
    
    try:
        cursor.execute(
            "INSERT INTO processed_posts (url, title, clean_url, clean_title) VALUES (?, ?, ?, ?)", 
            (url, title, clean_url, clean_title)
        )
        conn.commit()
    except sqlite3.IntegrityError:
        pass  # Already exists
    finally:
        conn.close()

def is_similar_to_recent_posts(title, threshold=0.65):
    """
    Checks if the title is similar to any recently processed titles in the database.
    Uses difflib.SequenceMatcher ratio.
    """
    if not title:
        return False
        
    conn = sqlite3.connect(DB_NAME)
    cursor = conn.cursor()
    
    # Fetch the 100 most recent processed titles
    cursor.execute("SELECT title FROM processed_posts ORDER BY processed_at DESC LIMIT 100")
    rows = cursor.fetchall()
    conn.close()
    
    new_title_clean = clean_title_for_comparison(title)
    if not new_title_clean:
        return False
        
    for row in rows:
        recent_title = row[0]
        recent_title_clean = clean_title_for_comparison(recent_title)
        if not recent_title_clean:
            continue
            
        # Calculate similarity ratio
        ratio = difflib.SequenceMatcher(None, new_title_clean, recent_title_clean).ratio()
        if ratio >= threshold:
            return True
            
    return False

