A few small syntax fixes and other minor cleanups

N0\A
2025-11-10 11:32:33 +01:00
parent e38b1b3052
commit d4ff7feb17
6 changed files with 757 additions and 408 deletions


@@ -1,50 +1,114 @@
# TODO: Switch to a different search provider
import requests
from typing import Optional, Dict, List, Any
from urllib.parse import urlencode
from bs4 import BeautifulSoup
from core.headers import get_useragent
class MullvadLetaWrapper:
"""Wrapper for Mullvad Leta privacy-focused search engine."""
BASE_URL = "https://leta.mullvad.net/search"
# Available search engines
ENGINES = ["brave", "google"]
# Available countries (from the HTML)
COUNTRIES = [
"ar", "au", "at", "be", "br", "ca", "cl", "cn", "dk", "fi",
"fr", "de", "hk", "in", "id", "it", "jp", "kr", "my", "mx",
"nl", "nz", "no", "ph", "pl", "pt", "ru", "sa", "za", "es",
"se", "ch", "tw", "tr", "uk", "us"
"ar",
"au",
"at",
"be",
"br",
"ca",
"cl",
"cn",
"dk",
"fi",
"fr",
"de",
"hk",
"in",
"id",
"it",
"jp",
"kr",
"my",
"mx",
"nl",
"nz",
"no",
"ph",
"pl",
"pt",
"ru",
"sa",
"za",
"es",
"se",
"ch",
"tw",
"tr",
"uk",
"us",
]
# Available languages
LANGUAGES = [
"ar", "bg", "ca", "zh-hans", "zh-hant", "hr", "cs", "da", "nl",
"en", "et", "fi", "fr", "de", "he", "hu", "is", "it", "jp",
"ko", "lv", "lt", "nb", "pl", "pt", "ro", "ru", "sr", "sk",
"sl", "es", "sv", "tr"
"ar",
"bg",
"ca",
"zh-hans",
"zh-hant",
"hr",
"cs",
"da",
"nl",
"en",
"et",
"fi",
"fr",
"de",
"he",
"hu",
"is",
"it",
"jp",
"ko",
"lv",
"lt",
"nb",
"pl",
"pt",
"ro",
"ru",
"sr",
"sk",
"sl",
"es",
"sv",
"tr",
]
# Time filters
TIME_FILTERS = ["d", "w", "m", "y"] # day, week, month, year
def __init__(self, engine: str = "brave"):
"""
Initialize the Mullvad Leta wrapper.
Args:
engine: Search engine to use ("brave" or "google")
"""
if engine not in self.ENGINES:
raise ValueError(f"Engine must be one of {self.ENGINES}")
self.engine = engine
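# Reuse a single HTTP session so connection pooling applies across repeated searches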
self.session = requests.Session()
def _get_headers(self) -> Dict[str, str]:
"""Get request headers with user agent."""
return {
@@ -59,45 +123,42 @@ class MullvadLetaWrapper:
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": get_useragent()
"user-agent": get_useragent(),
}
def search(
self,
query: str,
country: Optional[str] = None,
language: Optional[str] = None,
last_updated: Optional[str] = None,
page: int = 1,
) -> Dict[str, Any]:
"""
Perform a search on Mullvad Leta.
Args:
query: Search query string
country: Country code filter (e.g., "us", "uk")
language: Language code filter (e.g., "en", "fr")
last_updated: Time filter ("d", "w", "m", "y")
page: Page number (default: 1)
Returns:
Dictionary containing search results and metadata
"""
if country and country not in self.COUNTRIES:
raise ValueError(f"Invalid country code. Must be one of {self.COUNTRIES}")
if language and language not in self.LANGUAGES:
raise ValueError(f"Invalid language code. Must be one of {self.LANGUAGES}")
if last_updated and last_updated not in self.TIME_FILTERS:
raise ValueError(f"Invalid time filter. Must be one of {self.TIME_FILTERS}")
# Build query parameters
params = {"q": query, "engine": self.engine}
if country:
params["country"] = country
if language:
@@ -106,37 +167,37 @@ class MullvadLetaWrapper:
params["lastUpdated"] = last_updated
if page > 1:
params["page"] = str(page)
# Set cookie for engine preference
cookies = {"engine": self.engine}
# Make request
response = self.session.get(
self.BASE_URL,
params=params,
headers=self._get_headers(),
cookies=cookies,
timeout=10,
)
response.raise_for_status()
# Parse results
return self._parse_results(response.text, query, page)
def _parse_results(self, html: str, query: str, page: int) -> Dict[str, Any]:
"""
Parse HTML response and extract search results.
Args:
html: HTML response content
query: Original search query
page: Current page number
Returns:
Dictionary containing parsed results
"""
soup = BeautifulSoup(html, "html.parser")
results = {
"query": query,
"page": page,
@@ -144,100 +205,102 @@ class MullvadLetaWrapper:
"results": [],
"infobox": None,
"news": [],
"cached": False
"cached": False,
}
# Check if cached
cache_notice = soup.find("p", class_="small")
if cache_notice and "cached" in cache_notice.text.lower():
results["cached"] = True
# Extract regular search results
articles = soup.find_all("article", class_="svelte-fmlk7p")
for article in articles:
result = self._parse_article(article)
if result:
results["results"].append(result)
# Extract infobox if present
infobox_div = soup.find("div", class_="infobox")
if infobox_div:
results["infobox"] = self._parse_infobox(infobox_div)
# Extract news results
news_div = soup.find("div", class_="news")
if news_div:
news_articles = news_div.find_all("article")
for article in news_articles:
news_item = self._parse_news_article(article)
if news_item:
results["news"].append(news_item)
# Check for next page
next_button = soup.find("button", {"data-cy": "next-button"})
results["has_next_page"] = next_button is not None
return results
def _parse_article(self, article) -> Optional[Dict[str, str]]:
"""Parse a single search result article."""
try:
link_tag = article.find("a", href=True)
if not link_tag:
return None
title_tag = article.find("h3")
snippet_tag = article.find("p", class_="result__body")
cite_tag = article.find("cite")
return {
"url": link_tag['href'],
"url": link_tag["href"],
"title": title_tag.get_text(strip=True) if title_tag else "",
"snippet": snippet_tag.get_text(strip=True) if snippet_tag else "",
"display_url": cite_tag.get_text(strip=True) if cite_tag else ""
"display_url": cite_tag.get_text(strip=True) if cite_tag else "",
}
except Exception as e:
print(f"Error parsing article: {e}")
return None
def _parse_infobox(self, infobox_div) -> Dict[str, Any]:
"""Parse infobox information."""
infobox = {}
title_tag = infobox_div.find("h1")
if title_tag:
infobox["title"] = title_tag.get_text(strip=True)
subtitle_tag = infobox_div.find("h2")
if subtitle_tag:
infobox["subtitle"] = subtitle_tag.get_text(strip=True)
url_tag = infobox_div.find("a", rel="noreferrer")
if url_tag:
infobox["url"] = url_tag['href']
desc_tag = infobox_div.find('p')
infobox["url"] = url_tag["href"]
desc_tag = infobox_div.find("p")
if desc_tag:
infobox["description"] = desc_tag.get_text(strip=True)
return infobox
def _parse_news_article(self, article) -> Optional[Dict[str, str]]:
"""Parse a news article."""
try:
link_tag = article.find("a", href=True)
if not link_tag:
return None
title_tag = link_tag.find("h3")
cite_tag = link_tag.find("cite")
time_tag = link_tag.find("time")
return {
"url": link_tag['href'],
"url": link_tag["href"],
"title": title_tag.get_text(strip=True) if title_tag else "",
"source": cite_tag.get_text(strip=True) if cite_tag else "",
"timestamp": time_tag['datetime'] if time_tag and time_tag.has_attr('datetime') else ""
"timestamp": time_tag["datetime"]
if time_tag and time_tag.has_attr("datetime")
else "",
}
except Exception as e:
print(f"Error parsing news article: {e}")
@@ -248,23 +311,23 @@ class MullvadLetaWrapper:
if __name__ == "__main__":
# Create wrapper instance
leta = MullvadLetaWrapper(engine="brave")
# Perform a search
results = leta.search("python programming", country="us", language="en")
# Display results
print(f"Query: {results['query']}")
print(f"Engine: {results['engine']}")
print(f"Cached: {results['cached']}")
print(f"\nFound {len(results['results'])} results:\n")
for i, result in enumerate(results["results"][:5], 1):
print(f"{i}. {result['title']}")
print(f" URL: {result['url']}")
print(f" {result['snippet'][:100]}...\n")
if results["news"]:
print(f"\nNews ({len(results['news'])} items):")
for news in results["news"][:3]:
print(f"- {news['title']}")
print(f" {news['source']}\n")
print(f" {news['source']}\n")