"""News automation runner.

Reads RSS/Atom feeds grouped by category, extracts the full article text and
images from each linked page, sends the text to the AI processor for
rewriting, and publishes the result to WordPress.
"""

import logging
import os
import re
import time
import urllib.request
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests

from config import RSS_FEEDS, WP_POST_STATUS
from db_manager import init_db, is_url_processed, mark_url_as_processed, is_similar_to_recent_posts
from ai_processor import process_article
from wp_poster import post_to_wordpress

# Configure logging to write to both the console and the scraper.log file
# that lives next to this script.
log_file_path = os.path.join(os.path.dirname(__file__), "scraper.log")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler(log_file_path, encoding="utf-8"),
        logging.StreamHandler(),
    ],
)

# Browser-like headers shared by the feed fetcher and the page fetcher.
_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
}

# Images whose declared or URL-encoded dimension is at or below this many
# pixels are treated as thumbnails/avatars and skipped.
_SMALL_IMAGE_MAX_PX = 200

# Maximum number of in-body images collected per article.
_MAX_BODY_IMAGES = 3

# Maximum number of characters of article text passed to the AI processor.
_MAX_ARTICLE_CHARS = 8000

# Substrings in an image URL that mark it as a logo, avatar, share button, etc.
_EXCLUDE_KEYWORDS = (
    "logo", "icon", "avatar", "pixel", "badge", "ad-", "googleusercontent",
    "headshot", "author", "reporter", "writer", "staff", "profile", "user-",
    "thumbnail", "header", "footer", "byline", "square", "gravatar",
    "sharing", "facebook", "twitter", "email", "print", "pinterest",
)

# URL fragments that encode a size (e.g. resize/100, w=150, s=100). Every
# pattern exposes the number through the named group "size".
_SIZE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'[/\=\-\_\?]w(?:idth)?[\=\-\_]?(?P<size>\d+)',
        r'[/\=\-\_\?]h(?:eight)?[\=\-\_]?(?P<size>\d+)',
        r'[/\=\-\_\?]s(?:ize)?[\=\-\_]?(?P<size>\d+)',
        r'resize[/\=\-\_](?P<size>\d+)',
    )
)

# "<width>x<height>" fragment in a URL (e.g. 150x150).
_DIMENSIONS_PATTERN = re.compile(r'(\d+)x(\d+)')

# Image attributes in priority order: lazy-loading attributes first, so that
# a placeholder in ``src`` does not win over the real image URL.
_IMAGE_SRC_ATTRS = ("data-src", "data-original", "data-srcset", "srcset", "src")
_SRCSET_ATTRS = ("data-srcset", "srcset")

# Title substrings that mark an article as betting/gambling related.
_BETTING_KEYWORDS = (
    "betting", "odds", "picks", "draftkings", "fanduel", "cá độ", "cá cược",
    "tỷ lệ cược", "nhà cái", "wager", "gamble", "gambling", "handicap",
    "spread", "over/under", "lineup advice", "fantasy picks", "bookmaker",
    "prediction",
)

# Short betting terms that must match a whole word, not a substring.
_BETTING_WORDS = ("bet", "dfs", "pick")

# Translation table that strips punctuation before splitting a title into words.
_TITLE_PUNCTUATION = str.maketrans('', '', '.,!?#*()[]{}')


def clean_url_for_comparison(url):
    """Return a normalized form of *url* for duplicate detection.

    The URL is lower-cased, the ``http(s)://`` scheme and every ``www.``
    occurrence are removed, the query string and fragment are dropped, and
    trailing slashes are stripped. A falsy *url* yields ``""``.
    """
    if not url:
        return ""
    url_clean = url.lower().replace("https://", "").replace("http://", "").replace("www.", "")
    return url_clean.split("?")[0].split("#")[0].rstrip("/")


def _attr_is_small(value):
    """Return True if an img width/height attribute parses to a small size.

    Missing or non-numeric values (e.g. ``"100%"``) are not considered small.
    """
    if not value:
        return False
    try:
        return int(value) <= _SMALL_IMAGE_MAX_PX
    except (TypeError, ValueError):
        return False


def is_small_or_exclude_image(src, img):
    """Return True if the image should be skipped.

    An image is skipped when its URL is empty, is an SVG, contains one of
    the exclusion keywords (logo, avatar, share icons, ...), or when either
    the ``width``/``height`` attributes of *img* or a size encoded in the
    URL is at most 200 pixels.

    Args:
        src: The resolved image URL.
        img: The image tag (anything exposing ``.get(name)`` for attributes).
    """
    if not src:
        return True

    src_lower = src.lower()

    # Exclude SVG vector images.
    if ".svg" in src_lower:
        return True

    if any(keyword in src_lower for keyword in _EXCLUDE_KEYWORDS):
        return True

    # Each attribute is parsed on its own so that an unparsable width does
    # not prevent the height from being checked.
    if _attr_is_small(img.get("width")) or _attr_is_small(img.get("height")):
        return True

    # Sizes encoded in the URL (e.g. resize/100, w=150, s=100).
    for pattern in _SIZE_PATTERNS:
        for match in pattern.finditer(src_lower):
            if int(match.group("size")) <= _SMALL_IMAGE_MAX_PX:
                return True

    # A "WxH" fragment with a small side (e.g. 150x150). Only the first
    # occurrence in the URL is inspected.
    match = _DIMENSIONS_PATTERN.search(src_lower)
    if match:
        w_val = int(match.group(1))
        h_val = int(match.group(2))
        if w_val <= _SMALL_IMAGE_MAX_PX or h_val <= _SMALL_IMAGE_MAX_PX:
            return True

    return False


def fetch_rss_items(feed_url):
    """Fetch *feed_url* and return its entries.

    Both RSS (``<item>``) and Atom (``<entry>``) feeds are supported.

    Returns:
        A list of dicts with the keys ``"title"``, ``"link"`` and
        ``"description"``. Entries without a title or link are dropped.
        Any error while fetching or parsing is logged and yields ``[]``.
    """
    logging.info(f"Fetching RSS feed: {feed_url}")
    items = []
    try:
        req = urllib.request.Request(feed_url, headers=_REQUEST_HEADERS)
        with urllib.request.urlopen(req, timeout=15) as response:
            xml_data = response.read()

        soup = BeautifulSoup(xml_data, "xml")

        # Try RSS first, then fall back to Atom.
        entry_list = soup.find_all("item") or soup.find_all("entry")

        for entry in entry_list:
            title_tag = entry.find("title")
            title = title_tag.text.strip() if title_tag else "No Title"

            # The link is taken from the href attribute when present,
            # otherwise from the element text.
            link = ""
            link_tag = entry.find("link")
            if link_tag:
                href = link_tag.get("href")
                link = href.strip() if href else link_tag.text.strip()

            desc_tag = (
                entry.find("description")
                or entry.find("content:encoded")
                or entry.find("summary")
            )
            description = desc_tag.text.strip() if desc_tag else ""

            if title and link:
                items.append({
                    "title": title,
                    "link": link,
                    "description": description,
                })

        logging.info(f"Found {len(items)} items in feed.")
        return items
    except Exception as e:
        # Best-effort: one broken feed must not stop the whole run.
        logging.error(f"Error fetching/parsing feed {feed_url}: {e}")
        return []


def _pick_image_src(img):
    """Return the best absolute (or protocol-relative) URL of *img*, or None.

    Attributes are tried in priority order; ``data:`` placeholders and
    values that do not start with ``http`` or ``//`` are ignored. For srcset
    attributes the last candidate in the list is used.
    """
    for attr in _IMAGE_SRC_ATTRS:
        val = img.get(attr)
        if not val:
            continue
        val = val.strip()
        if val.startswith("data:"):
            continue
        if attr in _SRCSET_ATTRS:
            # "url1 300w, url2 600w" -> "url2"
            candidate = val.split(",")[-1].strip().split(" ")[0]
        else:
            candidate = val
        if candidate.startswith("http") or candidate.startswith("//"):
            return candidate
    return None


def _collect_body_images(article_body, image_url):
    """Return up to three distinct in-body image URLs, excluding *image_url*.

    Small images, logos and avatars are filtered out, and URLs are compared
    in normalized form so the cover image is not repeated.
    """
    body_image_urls = []
    seen = {clean_url_for_comparison(image_url)}

    for img in article_body.find_all("img"):
        src = _pick_image_src(img)
        if not src:
            continue
        if src.startswith("//"):
            src = "https:" + src

        if is_small_or_exclude_image(src, img):
            continue

        cleaned = clean_url_for_comparison(src)
        if cleaned in seen:
            continue
        seen.add(cleaned)
        body_image_urls.append(src)
        if len(body_image_urls) >= _MAX_BODY_IMAGES:
            break

    return body_image_urls


def extract_full_article_text(url):
    """Fetch the page at *url* and extract its article text and images.

    Returns:
        A 3-tuple ``(full_text, image_url, body_image_urls)``:

        * ``full_text`` - paragraphs longer than 30 characters joined by
          blank lines, truncated to 8000 characters.
        * ``image_url`` - the absolute ``og:image`` URL, or ``None``.
        * ``body_image_urls`` - up to three extra image URLs from the body.

        On any error the failure is logged and ``("", None, [])`` is returned.
    """
    logging.info(f"Extracting full text and image from page: {url}")
    try:
        response = requests.get(url, headers=_REQUEST_HEADERS, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # Extract og:image, resolving relative paths against the page URL.
        image_url = None
        og_image_tag = soup.find("meta", property="og:image")
        if og_image_tag and og_image_tag.get("content"):
            image_url = urljoin(url, og_image_tag.get("content").strip())
            logging.info(f"Found original cover image: {image_url}")

        # Drop non-content elements before looking for the article body.
        for element in soup(["script", "style", "header", "footer", "nav", "aside"]):
            element.decompose()

        # Locate the article body by common tags/classes, falling back to
        # the whole document.
        article_body = (
            soup.find("article")
            or soup.find(class_="article-body")
            or soup.find(class_="post-content")
            or soup
        )

        body_image_urls = _collect_body_images(article_body, image_url)

        stripped = (p.get_text().strip() for p in article_body.find_all("p"))
        paragraph_texts = [text for text in stripped if len(text) > 30]
        full_text = "\n\n".join(paragraph_texts)

        # Limit the content size to keep the AI request within a sane budget.
        return full_text[:_MAX_ARTICLE_CHARS], image_url, body_image_urls
    except Exception as e:
        # Best-effort: the caller treats an empty text as "skip this article".
        logging.error(f"Error extracting full text from {url}: {e}")
        return "", None, []


def _is_betting_title(title):
    """Return True if *title* looks betting/gambling related.

    Long keywords are matched as substrings; the short terms "bet", "dfs"
    and "pick" must appear as whole words after punctuation is stripped.
    """
    title_lower = title.lower()
    if any(kw in title_lower for kw in _BETTING_KEYWORDS):
        return True
    words = title_lower.translate(_TITLE_PUNCTUATION).split()
    return any(word in words for word in _BETTING_WORDS)


def _process_item(item, category_slug):
    """Rewrite and publish one feed item; return True if a post was created."""
    url = item["link"]
    title = item["title"]

    if _is_betting_title(title):
        logging.info(f"Skipping betting/gambling related article: {title}")
        return False

    # Check the database to avoid duplicate posting.
    # NOTE(review): presumably matches on both URL and title - confirm in db_manager.
    if is_url_processed(url, title):
        logging.info(f"Skipping already processed post (exact match): {title}")
        return False

    # Check title similarity with recent articles.
    if is_similar_to_recent_posts(title, threshold=0.65):
        logging.info(f"Skipping similar post (similarity check): {title}")
        return False

    logging.info(f"Processing new article: {title}")

    full_content, image_url, body_images = extract_full_article_text(url)

    # STRICT FILTER: skip if the full text could not be extracted.
    if not full_content or len(full_content) < 400:
        logging.warning(f"Skipping '{title}' - failed to extract full text due to anti-scraping block.")
        return False

    # STRICT FILTER: skip if there is no cover image, or it is hosted on
    # googleusercontent.com (treated as a placeholder).
    if not image_url or "googleusercontent.com" in image_url:
        logging.warning(f"Skipping '{title}' - no valid original cover image found.")
        return False

    # Rewrite the article through the AI processor.
    logging.info("Sending content to NVIDIA API for rewriting...")
    processed_data = process_article(title, full_content)

    if (
        not processed_data
        or not processed_data.get("content")
        or len(processed_data.get("content", "").strip()) < 200
    ):
        logging.error(f"AI failed to process or returned insufficient content for article: {title}")
        return False

    # Force the category from the feed mapping, overriding whatever the AI
    # processor returned.
    processed_data["category"] = category_slug

    logging.info(f"Publishing rewritten article to WordPress (category: {category_slug}, status: {WP_POST_STATUS})...")
    wp_post = post_to_wordpress(
        processed_data,
        status=WP_POST_STATUS,
        featured_image_url=image_url,
        body_image_urls=body_images,
    )

    if not wp_post:
        logging.error(f"Failed to publish to WordPress: {title}")
        return False

    # Record the article only after a successful publish so that failures
    # are retried on the next run.
    mark_url_as_processed(url, title)
    return True


def run_automation(max_posts_per_category=2):
    """Run one full pass over every category feed.

    For each category in ``config.CATEGORY_FEEDS`` the feeds are read in
    order and unprocessed items are rewritten and published until
    *max_posts_per_category* posts have been created for that category.
    A 10 second pause follows every successful post.

    Args:
        max_posts_per_category: Upper bound on posts created per category.
    """
    from config import CATEGORY_FEEDS

    init_db()

    total_created = 0
    separator = "=" * 60

    for category_slug, feed_urls in CATEGORY_FEEDS.items():
        posts_in_category = 0
        logging.info(f"\n{separator}")
        logging.info(f"Processing category: {category_slug.upper()}")
        logging.info(f"{separator}")

        for feed_url in feed_urls:
            if posts_in_category >= max_posts_per_category:
                break

            for item in fetch_rss_items(feed_url):
                if posts_in_category >= max_posts_per_category:
                    break

                if not _process_item(item, category_slug):
                    continue

                posts_in_category += 1
                total_created += 1
                logging.info("Saved state to database. Cool down for 10s...")
                time.sleep(10)

    logging.info(f"\n{separator}")
    logging.info(f"AUTOMATION COMPLETE: {total_created} articles posted across all categories")
    logging.info(f"{separator}")


if __name__ == "__main__":
    print("=== STARTING NEWS AUTOMATION RUN ===")
    run_automation(max_posts_per_category=2)  # at most 2 posts per category
    print("=== COMPLETED RUN ===")