import time
import urllib.request
import logging
import re
from bs4 import BeautifulSoup
import requests

from config import RSS_FEEDS, WP_POST_STATUS
from db_manager import init_db, is_url_processed, mark_url_as_processed, is_similar_to_recent_posts
from ai_processor import process_article
from wp_poster import post_to_wordpress

# Logging setup: every record is emitted both to the console and to a
# "scraper.log" file located in the same directory as this script.
import os

log_file_path = os.path.join(os.path.dirname(__file__), "scraper.log")
_log_handlers = [
    logging.FileHandler(log_file_path, encoding="utf-8"),
    logging.StreamHandler(),
]
logging.basicConfig(
    handlers=_log_handlers,
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

def clean_url_for_comparison(url):
    """
    Normalizes a URL so that two spellings of the same address compare equal.

    The result is lower-cased and has the leading protocol ("https://",
    "http://" or protocol-relative "//"), a leading "www.", the query string,
    the fragment and any trailing slashes removed.

    Only *leading* occurrences of the protocol and "www." are stripped, so a
    path segment that happens to contain "www." or "http://" is left intact.

    Returns an empty string for a falsy input (None or "").
    """
    if not url:
        return ""

    url_clean = url.lower()

    # Drop the scheme only when it is the prefix of the URL.
    for scheme in ("https://", "http://", "//"):
        if url_clean.startswith(scheme):
            url_clean = url_clean[len(scheme):]
            break

    # Drop "www." only when it starts the host name.
    if url_clean.startswith("www."):
        url_clean = url_clean[len("www."):]

    # Query string and fragment never identify a different image/page here.
    url_clean = url_clean.split("?")[0].split("#")[0].rstrip("/")
    return url_clean

# Images whose declared or URL-encoded size is at most this many pixels are
# treated as thumbnails/avatars and rejected.
_MAX_SMALL_IMAGE_PX = 200

# Substrings that mark an image URL as a logo, icon, avatar, share button, etc.
_EXCLUDED_IMAGE_KEYWORDS = (
    "logo", "icon", "avatar", "pixel", "badge", "ad-", "googleusercontent",
    "headshot", "author", "reporter", "writer", "staff", "profile", "user-",
    "thumbnail", "header", "footer", "byline", "square", "gravatar", "sharing",
    "facebook", "twitter", "email", "print", "pinterest",
)

# Size hints embedded in a URL (e.g. resize/100, w=150, &s=100, -h120).
# "&" is accepted as a leading separator so that non-first query parameters
# such as "?fit=crop&w=150" are recognised too.
_SIZE_HINT_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'[/\=\-\_\?&]w(?:idth)?[\=\-\_]?(?P<size>\d+)',
        r'[/\=\-\_\?&]h(?:eight)?[\=\-\_]?(?P<size>\d+)',
        r'[/\=\-\_\?&]s(?:ize)?[\=\-\_]?(?P<size>\d+)',
        r'resize[/\=\-\_](?P<size>\d+)',
    )
)

# "<width>x<height>" pair embedded in a URL, e.g. "photo-150x150.jpg".
_DIMENSIONS_PATTERN = re.compile(r'(\d+)x(\d+)')

# Attribute value written with a CSS-like pixel suffix, e.g. "150px".
_PIXEL_SUFFIX_PATTERN = re.compile(r'(\d+)\s*px', re.IGNORECASE)


def _parse_pixel_dimension(value):
    """Returns an img width/height attribute as an int, or None if it is absent or not a pixel value."""
    if not value:
        return None
    text = str(value).strip()
    try:
        return int(text)
    except ValueError:
        # Values such as "100%" or "auto" carry no pixel size and are ignored;
        # only an explicit "<n>px" is accepted besides a bare integer.
        suffixed = _PIXEL_SUFFIX_PATTERN.fullmatch(text)
        return int(suffixed.group(1)) if suffixed else None


def _url_without_host(src_lower):
    """Returns the path, query and fragment of a URL (everything after the host)."""
    from urllib.parse import urlsplit

    try:
        parts = urlsplit(src_lower)
    except ValueError:
        # Malformed URL (e.g. broken IPv6 brackets): fall back to the raw text.
        return src_lower

    remainder = parts.path
    if parts.query:
        remainder += "?" + parts.query
    if parts.fragment:
        remainder += "#" + parts.fragment
    return remainder


def is_small_or_exclude_image(src, img):
    """
    Checks if an image is small, a square avatar/headshot, logo, or icon.

    Args:
        src: The image URL. A falsy value is treated as "exclude".
        img: The image element; only ``img.get("width")`` and
            ``img.get("height")`` are used, so any object with a dict-like
            ``get`` works.

    Returns:
        True when the image should be skipped, i.e. when any of these holds:
        the URL is empty, refers to an SVG, contains an excluded keyword,
        the width or height attribute is at most 200, the URL path/query
        carries a size hint of at most 200, or the first "<w>x<h>" pair in
        the URL has a side of at most 200. False otherwise.
    """
    if not src:
        return True

    src_lower = src.lower()

    # Exclude SVG vector images
    if ".svg" in src_lower:
        return True

    # Exclude keywords in URL
    if any(keyword in src_lower for keyword in _EXCLUDED_IMAGE_KEYWORDS):
        return True

    # Check width/height in img tag attributes. Each attribute is parsed on
    # its own so that an unparsable width cannot hide a small height.
    for attr_name in ("width", "height"):
        pixels = _parse_pixel_dimension(img.get(attr_name))
        if pixels is not None and pixels <= _MAX_SMALL_IMAGE_PX:
            return True

    # Check URL patterns for small sizes. The host name is left out of the
    # scan: hosts such as "s3.amazonaws.com" or "s1.example.net" would
    # otherwise be read as "size 3" / "size 1" and reject every image they serve.
    size_hint_text = _url_without_host(src_lower)
    for pattern in _SIZE_HINT_PATTERNS:
        for match in pattern.finditer(size_hint_text):
            if int(match.group("size")) <= _MAX_SMALL_IMAGE_PX:
                return True

    # Also check the first "<w>x<h>" pair in the URL (e.g. 150x150): the image
    # is rejected when either side is small.
    dimensions = _DIMENSIONS_PATTERN.search(src_lower)
    if dimensions:
        w_val = int(dimensions.group(1))
        h_val = int(dimensions.group(2))
        if w_val <= _MAX_SMALL_IMAGE_PX or h_val <= _MAX_SMALL_IMAGE_PX:
            return True

    return False

def _extract_entry_link(entry):
    """Returns the article URL of a single RSS <item> / Atom <entry>, or "" if it has none."""
    first_link = entry.find("link")
    if first_link is None:
        return ""

    if not first_link.get("href"):
        # RSS style: the URL is the text content of <link>.
        return first_link.text.strip()

    # Atom style: an entry may carry several <link href=...> elements
    # (self, edit, replies, enclosure, ...). The article page is the one whose
    # rel is "alternate"; a link without rel counts as "alternate" as well.
    for candidate in entry.find_all("link"):
        href = candidate.get("href")
        rel = candidate.get("rel") or "alternate"
        rel_values = rel if isinstance(rel, list) else rel.split()
        if href and "alternate" in rel_values:
            return href.strip()

    # No alternate link declared: keep the first href, as before.
    return first_link.get("href").strip()


def fetch_rss_items(feed_url):
    """
    Fetches and parses feed items from an RSS/XML feed.

    Both RSS (<item>) and Atom (<entry>) layouts are supported; Atom entries
    are only looked for when no <item> element exists.

    Args:
        feed_url: Address of the feed to download.

    Returns:
        A list of dicts with the keys "title", "link" and "description".
        Entries without a link (or with an empty title) are left out.
        An empty list is returned when the download or parsing fails; the
        error is logged rather than raised.
    """
    logging.info(f"Fetching RSS feed: {feed_url}")
    items = []

    # Browser-like User-Agent sent with the feed request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }

    try:
        req = urllib.request.Request(feed_url, headers=headers)
        with urllib.request.urlopen(req, timeout=15) as response:
            xml_data = response.read()

        soup = BeautifulSoup(xml_data, "xml")

        # Try RSS format <item>, then Atom format <entry>
        entry_list = soup.find_all("item") or soup.find_all("entry")

        for entry in entry_list:
            # Find title
            title_tag = entry.find("title")
            title = title_tag.text.strip() if title_tag else "No Title"

            # Find link
            link = _extract_entry_link(entry)

            # Find description/content: first available of these tags wins.
            desc_tag = None
            for tag_name in ("description", "content:encoded", "summary"):
                desc_tag = entry.find(tag_name)
                if desc_tag:
                    break
            description = desc_tag.text.strip() if desc_tag else ""

            if title and link:
                items.append({
                    "title": title,
                    "link": link,
                    "description": description
                })

        logging.info(f"Found {len(items)} items in feed.")
        return items

    except Exception as e:
        # Best-effort: one broken feed must not stop the whole run.
        logging.error(f"Error fetching/parsing feed {feed_url}: {e}")
        return []

def _select_image_source(img):
    """Returns the first absolute (http... or //...) image URL found on an <img> tag, or None."""
    # Prioritize high-quality/original attributes over lazy-loading
    # placeholders (e.g. data:image/...), which usually sit in "src".
    for attr in ("data-src", "data-original", "data-srcset", "srcset", "src"):
        val = img.get(attr)
        if not val or val.strip().startswith("data:"):
            continue
        val = val.strip()

        if attr in ("data-srcset", "srcset"):
            # A srcset is a comma-separated list of "<url> <descriptor>";
            # the URL of the last entry is used.
            candidate = val.split(",")[-1].strip().split(" ")[0]
        else:
            candidate = val

        if candidate.startswith(("http", "//")):
            return candidate
    return None


def extract_full_article_text(url):
    """
    Attempts to fetch the webpage and extract the main article text,
    the main OpenGraph image (og:image) and up to three in-article images.

    Args:
        url: Address of the article page.

    Returns:
        A tuple (full_text, image_url, body_image_urls):
          - full_text: paragraphs longer than 30 characters joined by blank
            lines, truncated to 8000 characters.
          - image_url: absolute og:image URL, or None when the page has none.
          - body_image_urls: list of at most 3 image URLs from the article
            body, excluding small/logo images, the og:image and duplicates.
        On any error the failure is logged and ("", None, []) is returned.
    """
    from urllib.parse import urljoin

    logging.info(f"Extracting full text and image from page: {url}")
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }

    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        # Hand the raw bytes to BeautifulSoup so it can detect the encoding
        # (e.g. from <meta charset>). response.text would fall back to
        # ISO-8859-1 when the Content-Type header names no charset, which
        # garbles UTF-8 pages. A charset that IS declared in the header is
        # passed along as the preferred encoding.
        content_type = response.headers.get("content-type", "")
        declared_encoding = response.encoding if "charset" in content_type.lower() else None
        soup = BeautifulSoup(response.content, "html.parser", from_encoding=declared_encoding)

        # Extract og:image (using urljoin to resolve relative paths to absolute URLs)
        image_url = None
        og_image_tag = soup.find("meta", property="og:image")
        if og_image_tag and og_image_tag.get("content"):
            image_url = urljoin(url, og_image_tag.get("content").strip())
            logging.info(f"Found original cover image: {image_url}")

        # Remove script, style and page-chrome elements before reading text/images
        for element in soup(["script", "style", "header", "footer", "nav", "aside"]):
            element.decompose()

        # Try to find the article body by common tags/classes,
        # falling back to the whole document.
        article_body = (
            soup.find("article")
            or soup.find(class_="article-body")
            or soup.find(class_="post-content")
            or soup
        )

        # Extract body images (up to 3 images, filtering out logos, icons, and small stuff)
        body_image_urls = []
        featured_clean = clean_url_for_comparison(image_url)
        seen_clean = set()  # normalized URLs already accepted

        for img in article_body.find_all("img"):
            src = _select_image_source(img)
            if not src:
                continue
            if src.startswith("//"):
                src = "https:" + src

            # Skip small, square, or author profile images
            if is_small_or_exclude_image(src, img):
                continue

            # Compare clean URLs to avoid duplicates (including featured image)
            src_clean = clean_url_for_comparison(src)
            if src_clean == featured_clean or src_clean in seen_clean:
                continue

            seen_clean.add(src_clean)
            body_image_urls.append(src)
            if len(body_image_urls) >= 3:  # limit to max 3 extra images
                break

        # Keep only paragraphs with some substance (more than 30 characters).
        paragraph_texts = []
        for p in article_body.find_all("p"):
            text = p.get_text().strip()
            if len(text) > 30:
                paragraph_texts.append(text)

        # Combine text paragraphs
        full_text = "\n\n".join(paragraph_texts)

        # Limit content size handed to the AI rewriting step to prevent excessive token usage
        return full_text[:8000], image_url, body_image_urls

    except Exception as e:
        # Best-effort: callers treat an empty text as "could not extract".
        logging.error(f"Error extracting full text from {url}: {e}")
        return "", None, []

# Phrases that mark a title as betting/gambling content (substring match).
_BETTING_KEYWORDS = (
    "betting", "odds", "picks", "draftkings", "fanduel", "cá độ", "cá cược",
    "tỷ lệ cược", "nhà cái", "wager", "gamble", "gambling", "handicap", "spread",
    "over/under", "lineup advice", "fantasy picks", "bookmaker", "prediction",
)

# Short terms that only count when they appear as a whole word.
_BETTING_WORDS = frozenset({"bet", "dfs", "pick"})


def _is_betting_title(title):
    """Returns True when the title contains a betting keyword or a whole-word betting term."""
    title_lower = title.lower()
    if any(kw in title_lower for kw in _BETTING_KEYWORDS):
        return True
    # Tokenize on every non-word character so that punctuation glued to a word
    # ("Best Bet: ...", "'pick'", "bet;") cannot hide it from the check.
    words = re.findall(r"\w+", title_lower)
    return not _BETTING_WORDS.isdisjoint(words)


def run_automation(max_posts_per_category=2):
    """
    Main runner: iterates through feeds organized by category, identifies unprocessed items,
    rewrites them using AI, and publishes to WordPress with the correct category.

    For every category in ``config.CATEGORY_FEEDS`` the feeds are read in order
    until ``max_posts_per_category`` articles have been published. An item is
    skipped when its title looks betting-related, when it was already
    processed or resembles a recent post, when fewer than 400 characters of
    text could be extracted, when no usable cover image exists, or when the
    AI step returns less than 200 characters of content.

    Args:
        max_posts_per_category: Maximum number of articles to publish per
            category during this run.

    Returns:
        None. Progress and the final total are written to the log.
    """
    from config import CATEGORY_FEEDS
    init_db()
    total_created = 0
    separator = "=" * 60

    for category_slug, feed_urls in CATEGORY_FEEDS.items():
        posts_in_category = 0
        logging.info(f"\n{separator}")
        logging.info(f"Processing category: {category_slug.upper()}")
        logging.info(f"{separator}")

        for feed_url in feed_urls:
            if posts_in_category >= max_posts_per_category:
                break

            items = fetch_rss_items(feed_url)

            for item in items:
                if posts_in_category >= max_posts_per_category:
                    break

                url = item["link"]
                title = item["title"]

                # Check for betting/gambling content
                if _is_betting_title(title):
                    logging.info(f"Skipping betting/gambling related article: {title}")
                    continue

                # Check database to avoid duplicate posting (both URL and title are passed)
                if is_url_processed(url, title):
                    logging.info(f"Skipping already processed post (exact match): {title}")
                    continue

                # Check title similarity with recent articles
                if is_similar_to_recent_posts(title, threshold=0.65):
                    logging.info(f"Skipping similar post (similarity check): {title}")
                    continue

                logging.info(f"Processing new article: {title}")

                # Get full content, image url & extra body images
                full_content, image_url, body_images = extract_full_article_text(url)

                # STRICT FILTER: Skip if full text could not be extracted (anti-scraping/blocked)
                if not full_content or len(full_content) < 400:
                    logging.warning(f"Skipping '{title}' - failed to extract full text due to anti-scraping block.")
                    continue

                # STRICT FILTER: Skip if no original cover image is available (or if it's a Google placeholder logo)
                if not image_url or "googleusercontent.com" in image_url:
                    logging.warning(f"Skipping '{title}' - no valid original cover image found.")
                    continue

                # Rewrite the content through the AI processor (process_article)
                logging.info("Sending content to NVIDIA API for rewriting...")
                processed_data = process_article(title, full_content)

                rewritten = processed_data.get("content") if processed_data else None
                if not rewritten or len(rewritten.strip()) < 200:
                    logging.error(f"AI failed to process or returned insufficient content for article: {title}")
                    continue

                # Force category from the feed mapping (override AI classification)
                processed_data["category"] = category_slug

                # Post rewritten content to WordPress (using configured status: publish or draft)
                logging.info(f"Publishing rewritten article to WordPress (category: {category_slug}, status: {WP_POST_STATUS})...")
                wp_post = post_to_wordpress(processed_data, status=WP_POST_STATUS, featured_image_url=image_url, body_image_urls=body_images)

                if not wp_post:
                    # Not recorded as processed, so the item stays eligible on a later run.
                    logging.error(f"Failed to publish to WordPress: {title}")
                    continue

                # Record the article only after a successful publish.
                mark_url_as_processed(url, title)
                posts_in_category += 1
                total_created += 1
                logging.info("Saved state to database. Cool down for 10s...")
                time.sleep(10)

    logging.info(f"\n{separator}")
    logging.info(f"AUTOMATION COMPLETE: {total_created} articles posted across all categories")
    logging.info(f"{separator}")

if __name__ == "__main__":
    # Script entry point: perform a single automation pass, publishing at
    # most two new posts for each configured category.
    posts_cap = 2
    print("=== STARTING NEWS AUTOMATION RUN ===")
    run_automation(max_posts_per_category=posts_cap)
    print("=== COMPLETED RUN ===")
