import threading

import requests
from dotenv import load_dotenv
from upstash_redis import Redis

# Load environment variables (UPSTASH_REDIS_REST_URL, UPSTASH_REDIS_REST_TOKEN)
# from a .env file
load_dotenv()

# Initialize the Redis client from those environment variables
redis = Redis.from_env()

# One URL group per thread; each later group repeats two URLs from the first
# group so the threads exercise the shared cache
urls_to_scrape_groups = [
    [
        'https://httpbin.org/delay/1',
        'https://httpbin.org/delay/4',
        'https://httpbin.org/delay/2',
        'https://httpbin.org/delay/5',
        'https://httpbin.org/delay/3',
    ],
    [
        'https://httpbin.org/delay/5',  # Overlapping URL
        'https://httpbin.org/delay/6',
        'https://httpbin.org/delay/7',
        'https://httpbin.org/delay/2',  # Overlapping URL
        'https://httpbin.org/delay/8',
    ],
    [
        'https://httpbin.org/delay/3',  # Overlapping URL
        'https://httpbin.org/delay/9',
        'https://httpbin.org/delay/10',
        'https://httpbin.org/delay/4',  # Overlapping URL
        'https://httpbin.org/delay/11',
    ],
]
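# 15 entries overall but only 11 unique URLs: delay/2, /3, /4, and /5 each
# appear in two groups, so threads can serve the repeats from the shared cache.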


class Scraper(threading.Thread):
    def __init__(self, urls):
        super().__init__()
        self.urls = urls
        self.results = {}

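    # Per-thread work loop: check the shared Redis cache first, fetch over
    # HTTP only on a miss, then write the body back for the other threads.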
    def run(self):
        for url in self.urls:
            cache_key = f"url:{url}"
            
            # Attempt to retrieve cached response
            cached_response = redis.get(cache_key)
            
            if cached_response:
                print(f"[CACHE HIT] {self.name} - URL: {url}")
                self.results[url] = cached_response
                continue  # Skip to the next URL if cache is found
            
            # If no cache, perform the HTTP request (timeout guards against hangs)
            print(f"[FETCHING] {self.name} - URL: {url}")
            try:
                response = requests.get(url, timeout=30)
            except requests.RequestException as exc:
                print(f"[ERROR] {self.name} - Request for {url} failed: {exc}")
                self.results[url] = None
                continue
            if response.status_code == 200:
                self.results[url] = response.text
                # Cache the response; ex=3600 assumes a one-hour freshness window
                redis.set(cache_key, response.text, ex=3600)
            else:
                print(f"[ERROR] {self.name} - Failed to retrieve {url} (status {response.status_code})")
                self.results[url] = None
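
    def claim_url(self, cache_key):
        # Hypothetical helper, not called by run() above: a minimal sketch of
        # deduplicating overlapping URLs with SET NX, so only one thread
        # fetches a given URL on a cold cache. Assumes upstash_redis's set()
        # accepts nx/ex keywords; the ":fetching" suffix is an illustrative
        # naming convention, not part of the library.
        return bool(redis.set(f"{cache_key}:fetching", self.name, nx=True, ex=60))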


def main():
    threads = []
    for urls in urls_to_scrape_groups:
        scraper = Scraper(urls)
        threads.append(scraper)
        scraper.start()

    # Wait for all threads to complete
    for scraper in threads:
        scraper.join()
    print("\nScraping results:")
    for scraper in threads:
        for url, result in scraper.results.items():
            print(f"Thread {scraper.name} - URL: {url} - Response Length: {len(result) if result else 'Failed'}")


if __name__ == "__main__":
    main()
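
# Usage sketch: create a .env file with UPSTASH_REDIS_REST_URL and
# UPSTASH_REDIS_REST_TOKEN (from the Upstash console), then run the script
# twice. The first run logs [FETCHING] for every unique URL; the second run
# (and the overlapping URLs within the first) should mostly log [CACHE HIT]
# until the one-hour TTL set above expires.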