#!/usr/bin/env python3
"""
Script to extract URLs from images.php and store them in SQLite3 database
"""

import re
import sqlite3
import os
from urllib.parse import urlparse

def extract_urls_from_file(file_path):
    """Return every http(s) URL found in the file at *file_path*.

    The file is read as UTF-8 text; candidate URLs are matched with a
    regex and trailing sentence punctuation is trimmed off each match.
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        text = fh.read()

    # Candidate = scheme followed by a run of characters that cannot
    # terminate a URL inside HTML/PHP markup (whitespace, quotes, <, >).
    candidates = re.findall(r'https?://[^\s"\'<>]+', text)

    collected = []
    for candidate in candidates:
        # The greedy match may swallow trailing punctuation — trim it.
        cleaned = candidate.rstrip('.,;:!?')
        if cleaned and cleaned.startswith(('http://', 'https://')):
            collected.append(cleaned)

    return collected

def create_database(db_path):
    """Open (creating if needed) the SQLite database at *db_path* and make
    sure the ``urls`` table plus its lookup indexes exist.

    Returns the open :class:`sqlite3.Connection`; the caller is
    responsible for closing it.
    """
    connection = sqlite3.connect(db_path)
    cur = connection.cursor()

    # One row per unique URL, with parsed-out pieces for easy querying.
    cur.execute('''
        CREATE TABLE IF NOT EXISTS urls (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT UNIQUE NOT NULL,
            domain TEXT,
            path TEXT,
            filename TEXT,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            processed BOOLEAN DEFAULT 0
        )
    ''')

    # Indexes back the per-domain statistics and processed-flag scans.
    for ddl in (
        'CREATE INDEX IF NOT EXISTS idx_domain ON urls(domain)',
        'CREATE INDEX IF NOT EXISTS idx_processed ON urls(processed)',
    ):
        cur.execute(ddl)

    connection.commit()
    return connection

def parse_url_info(url):
    """Split *url* into a ``(domain, path, filename)`` tuple.

    ``filename`` is the last path component and is empty when the path
    ends with ``/`` or is absent.
    """
    parts = urlparse(url)
    return parts.netloc, parts.path, os.path.basename(parts.path)

def import_urls_to_database(urls, db_path):
    """Insert *urls* into the SQLite database at *db_path*.

    Duplicates (already in the table, or repeated within *urls*) are
    skipped via ``INSERT OR IGNORE``.  Per-URL failures are reported and
    counted as skipped rather than aborting the whole import.

    Returns a ``(imported_count, skipped_count)`` tuple.
    """
    conn = create_database(db_path)
    try:
        cursor = conn.cursor()

        imported_count = 0
        skipped_count = 0

        for url in urls:
            try:
                domain, path, filename = parse_url_info(url)

                # The UNIQUE constraint on url makes IGNORE drop duplicates;
                # rowcount tells us whether the row was actually inserted.
                cursor.execute('''
                    INSERT OR IGNORE INTO urls (url, domain, path, filename)
                    VALUES (?, ?, ?, ?)
                ''', (url, domain, path, filename))

                if cursor.rowcount > 0:
                    imported_count += 1
                else:
                    skipped_count += 1

            except Exception as e:
                # Deliberate best-effort import: report and continue.
                print(f"Error processing URL {url}: {e}")
                skipped_count += 1

        conn.commit()
    finally:
        # Fix: the original leaked the connection if an insert or the
        # commit raised; always close it, even on failure.
        conn.close()

    return imported_count, skipped_count

def main(images_file='/var/www/vynex.org/images.php',
         db_file='/var/www/vynex.org/urls.db'):
    """Extract URLs from *images_file*, import them into *db_file*, and
    print a summary plus basic database statistics.

    The paths are now parameters (defaulting to the original hard-coded
    locations) so the pipeline can be reused on other files without
    editing the source; calling ``main()`` behaves exactly as before.
    """
    print("Extracting URLs from images.php...")
    urls = extract_urls_from_file(images_file)
    print(f"Found {len(urls)} URLs")

    print("Creating SQLite database...")
    conn = create_database(db_file)
    conn.close()

    print("Importing URLs to database...")
    imported, skipped = import_urls_to_database(urls, db_file)

    print(f"\nImport completed!")
    print(f"Imported: {imported} URLs")
    print(f"Skipped (duplicates): {skipped} URLs")
    print(f"Database saved to: {db_file}")

    # Statistics pass: re-open read-only-style; try/finally guarantees the
    # connection closes even if a query fails (original could leak it).
    conn = sqlite3.connect(db_file)
    try:
        cursor = conn.cursor()

        # Total number of stored URLs.
        cursor.execute('SELECT COUNT(*) FROM urls')
        total_count = cursor.fetchone()[0]

        # Number of distinct domains seen.
        cursor.execute('SELECT COUNT(DISTINCT domain) FROM urls')
        unique_domains = cursor.fetchone()[0]

        # Ten most frequent domains.
        cursor.execute('''
            SELECT domain, COUNT(*) as count 
            FROM urls 
            GROUP BY domain 
            ORDER BY count DESC 
            LIMIT 10
        ''')
        top_domains = cursor.fetchall()

        print(f"\nDatabase Statistics:")
        print(f"Total URLs: {total_count}")
        print(f"Unique domains: {unique_domains}")
        print(f"\nTop 10 domains:")
        for domain, count in top_domains:
            print(f"  {domain}: {count} URLs")
    finally:
        conn.close()
# Standard entry-point guard: run the pipeline only when executed as a
# script, not when this module is imported.
if __name__ == "__main__":
    main()
