#!/usr/bin/env python3
"""
Image Downloader Script
Downloads images from URLs in SQLite database with rate limiting and progress tracking
"""

import hashlib
import logging
import os
import sqlite3
import time
from contextlib import closing
from datetime import datetime
from pathlib import Path
from urllib.parse import urlparse

import requests

class ImageDownloader:
    """Download images listed in a SQLite ``urls`` table, one at a time.

    The table is expected to provide at least the columns ``id``, ``url``,
    ``filename`` and ``processed``; the bookkeeping columns
    ``downloaded_at``, ``download_failed`` and ``local_filename`` are added
    on demand by :meth:`add_download_columns`.
    """

    # Bookkeeping columns added to ``urls``: (column name, SQL definition).
    _EXTRA_COLUMNS = (
        ('downloaded_at', 'DATETIME'),
        ('download_failed', 'BOOLEAN DEFAULT 0'),
        ('local_filename', 'TEXT'),
    )

    def __init__(self, db_path, images_folder, delay_seconds=3):
        """Configure logging, the HTTP session and the target folder.

        Args:
            db_path: Path of the SQLite database holding the ``urls`` table.
            images_folder: Directory the images are written to; created
                (including missing parents) if it does not exist.
            delay_seconds: Pause between two consecutive downloads.
        """
        self.db_path = db_path
        self.images_folder = Path(images_folder)
        self.delay_seconds = delay_seconds
        self.session = requests.Session()

        # Setup logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('image_downloader.log'),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

        # parents=True so a nested, not-yet-existing target path also works.
        self.images_folder.mkdir(parents=True, exist_ok=True)

        # Setup session headers
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def get_next_url_to_download(self):
        """Return the unprocessed row with the lowest id.

        Returns:
            An ``(id, url, filename)`` tuple, or ``None`` when every row has
            been processed.
        """
        # closing() guarantees the connection is released even if the
        # query raises.
        with closing(sqlite3.connect(self.db_path)) as conn:
            return conn.execute('''
                SELECT id, url, filename FROM urls
                WHERE processed = 0
                ORDER BY id
                LIMIT 1
            ''').fetchone()

    def get_download_stats(self):
        """Return ``(total, downloaded, remaining)`` row counts.

        ``downloaded`` counts rows with ``processed = 1`` (which includes
        rows whose download failed); ``remaining`` is ``total - downloaded``.
        """
        with closing(sqlite3.connect(self.db_path)) as conn:
            total, downloaded = conn.execute('''
                SELECT COUNT(*),
                       COUNT(CASE WHEN processed = 1 THEN 1 END)
                FROM urls
            ''').fetchone()
        return total, downloaded, total - downloaded

    def generate_filename(self, url, original_filename):
        """Build the local file name ``<hash>_<name>`` for an image.

        Args:
            url: Source URL; its first 8 MD5 hex digits prefix the name so
                that equal names from different URLs do not collide.
            original_filename: Name stored in the database; may be empty or
                ``None``, in which case ``image`` is used.

        Returns:
            A file name without any directory component. If the original
            name has no extension, the one from the URL path is appended,
            falling back to ``jpg``.
        """
        # MD5 is used only to derive a short, stable prefix, not for security.
        url_hash = hashlib.md5(url.encode()).hexdigest()[:8]

        # Keep only the last path component so a stored name such as
        # "../../x" cannot escape the images folder.
        base = Path((original_filename or '').replace('\\', '/')).name
        base = base.replace(' ', '_')
        if not base or base in ('.', '..'):
            base = 'image'

        if '.' not in base.strip('.'):
            # No extension on the stored name: take it from the URL path,
            # defaulting to jpg.
            ext = Path(urlparse(url).path).suffix.lstrip('.').lower()
            if not (ext.isalnum() and len(ext) <= 5):
                ext = 'jpg'
            base = f"{base.rstrip('.')}.{ext}"

        return f"{url_hash}_{base}"

    def _discard_partial(self, path):
        """Best-effort removal of a temporary download file."""
        if path is None:
            return
        try:
            path.unlink()
        except FileNotFoundError:
            pass  # Nothing was written yet.
        except OSError as e:
            self.logger.warning(f"Could not remove partial file {path}: {e}")

    def download_image(self, url, filename):
        """Download one image into the images folder.

        The body is streamed to ``<filename>.part`` and renamed to
        ``filename`` only after a complete, non-empty transfer, so a failed
        download never leaves a truncated image behind.

        Args:
            url: Address to fetch.
            filename: Target file name inside ``self.images_folder``.

        Returns:
            ``True`` on success; ``False`` if the request failed, the
            response is not an ``image/*`` content type, or the file is
            empty. Errors are logged, never raised.
        """
        partial_path = None
        try:
            self.logger.info(f"Downloading: {url}")
            file_path = self.images_folder / filename
            partial_path = file_path.with_name(file_path.name + '.part')

            # The context manager releases the streamed connection back to
            # the pool on every exit path.
            with self.session.get(url, timeout=30, stream=True) as response:
                response.raise_for_status()

                # Media types are case-insensitive, so normalise first.
                content_type = response.headers.get('content-type', '')
                if not content_type.lower().startswith('image/'):
                    self.logger.warning(f"URL doesn't appear to be an image: {content_type}")
                    return False

                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)

            size = partial_path.stat().st_size
            if size == 0:
                self.logger.error(f"Downloaded file is empty: {filename}")
                self._discard_partial(partial_path)
                return False

            partial_path.replace(file_path)
            self.logger.info(f"Successfully downloaded: {filename} ({size} bytes)")
            return True

        except requests.exceptions.RequestException as e:
            self.logger.error(f"Request failed for {url}: {e}")
            self._discard_partial(partial_path)
            return False
        except Exception as e:
            self.logger.error(f"Unexpected error downloading {url}: {e}")
            self._discard_partial(partial_path)
            return False

    def mark_as_downloaded(self, url_id, success=True, local_filename=None):
        """Mark a row as processed and stamp ``downloaded_at``.

        Args:
            url_id: ``id`` of the row to update.
            success: When ``False`` the row is also flagged
                ``download_failed = 1``.
            local_filename: If given for a successful download, stored in
                ``local_filename`` in the same statement so the flag and the
                file name cannot get out of sync.
        """
        assignments = ['processed = 1', 'downloaded_at = CURRENT_TIMESTAMP']
        params = []
        if not success:
            assignments.append('download_failed = 1')
        elif local_filename is not None:
            assignments.append('local_filename = ?')
            params.append(local_filename)
        params.append(url_id)

        # Only fixed column fragments are interpolated; values stay bound.
        statement = f"UPDATE urls SET {', '.join(assignments)} WHERE id = ?"
        with closing(sqlite3.connect(self.db_path)) as conn:
            conn.execute(statement, params)
            conn.commit()

    def add_download_columns(self):
        """Add the bookkeeping columns to ``urls`` if they are missing.

        Existing columns are detected via ``PRAGMA table_info`` instead of
        swallowing ``sqlite3.OperationalError``, so real problems (for
        example a missing ``urls`` table) surface as an exception.
        """
        with closing(sqlite3.connect(self.db_path)) as conn:
            # Column 1 of each table_info row is the column name.
            existing = {row[1] for row in conn.execute('PRAGMA table_info(urls)')}
            for name, definition in self._EXTRA_COLUMNS:
                if name not in existing:
                    conn.execute(f'ALTER TABLE urls ADD COLUMN {name} {definition}')
            conn.commit()

    def download_all_images(self):
        """Download every unprocessed URL, pausing between requests.

        Stops when no unprocessed row is left or after 10 consecutive
        failed downloads. Every attempted row is marked processed, so a
        later run continues with the next row.
        """
        self.logger.info("Starting image download process...")

        # Add additional columns if needed
        self.add_download_columns()

        total, downloaded, remaining = self.get_download_stats()
        self.logger.info(f"Total URLs: {total}, Downloaded: {downloaded}, Remaining: {remaining}")

        if remaining == 0:
            self.logger.info("All images have already been downloaded!")
            return

        consecutive_failures = 0
        max_consecutive_failures = 10

        while True:
            # Get next URL to download
            result = self.get_next_url_to_download()
            if not result:
                self.logger.info("No more URLs to download!")
                break

            url_id, url, original_filename = result
            filename = self.generate_filename(url, original_filename)
            success = self.download_image(url, filename)

            # Record the outcome and, on success, the local file name in a
            # single update.
            self.mark_as_downloaded(
                url_id, success, local_filename=filename if success else None
            )

            if success:
                consecutive_failures = 0
            else:
                consecutive_failures += 1
                if consecutive_failures >= max_consecutive_failures:
                    self.logger.error(f"Too many consecutive failures ({consecutive_failures}). Stopping download process.")
                    break

            # Show progress
            total, downloaded, remaining = self.get_download_stats()
            progress = (downloaded / total) * 100 if total > 0 else 0
            self.logger.info(f"Progress: {downloaded}/{total} ({progress:.1f}%) - Remaining: {remaining}")

            # Rate limiting
            if remaining > 0:  # Don't delay after the last download
                self.logger.info(f"Waiting {self.delay_seconds} seconds before next download...")
                time.sleep(self.delay_seconds)

        self.logger.info("Download process completed!")

    def resume_download(self):
        """Resume download from where it left off.

        Equivalent to :meth:`download_all_images`, which already skips
        processed rows.
        """
        self.logger.info("Resuming download process...")
        self.download_all_images()

def main():
    """Run the downloader with the built-in paths and report the outcome.

    Prints the configuration, then downloads every pending image. A
    Ctrl-C is reported as a resumable interruption; any other error is
    written to the log with its traceback before the user is pointed at
    the log file.
    """
    db_path = '/var/www/vynex.org/urls.db'
    images_folder = '/var/www/vynex.org/images'
    delay_seconds = 3  # 3 seconds between downloads

    downloader = ImageDownloader(db_path, images_folder, delay_seconds)

    print("Image Downloader")
    print("===============")
    print(f"Database: {db_path}")
    print(f"Images folder: {images_folder}")
    print(f"Delay between downloads: {delay_seconds} seconds")
    print()

    try:
        downloader.download_all_images()
    except KeyboardInterrupt:
        print("\nDownload interrupted by user. Progress has been saved.")
        print("Run the script again to resume from where it left off.")
    except Exception as e:
        # Record the traceback so the "check the log file" hint below
        # actually leads somewhere.
        downloader.logger.exception("Download process aborted by an unexpected error")
        print(f"An error occurred: {e}")
        print("Check the log file for more details.")

# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
