# TODO: Switch to a different search provider

from typing import Any, Dict, Optional

import requests
from bs4 import BeautifulSoup

from core.headers import get_useragent


class MullvadLetaWrapper:
    """Wrapper for the Mullvad Leta privacy-focused search engine."""

    BASE_URL = "https://leta.mullvad.net/search"

    # Available search engines
    ENGINES = ["brave", "google"]

    # Available countries (from the HTML)
    COUNTRIES = [
        "ar",
        "au",
        "at",
        "be",
        "br",
        "ca",
        "cl",
        "cn",
        "dk",
        "fi",
        "fr",
        "de",
        "hk",
        "in",
        "id",
        "it",
        "jp",
        "kr",
        "my",
        "mx",
        "nl",
        "nz",
        "no",
        "ph",
        "pl",
        "pt",
        "ru",
        "sa",
        "za",
        "es",
        "se",
        "ch",
        "tw",
        "tr",
        "uk",
        "us",
    ]

    # Available languages
    LANGUAGES = [
        "ar",
        "bg",
        "ca",
        "zh-hans",
        "zh-hant",
        "hr",
        "cs",
        "da",
        "nl",
        "en",
        "et",
        "fi",
        "fr",
        "de",
        "he",
        "hu",
        "is",
        "it",
        "jp",
        "ko",
        "lv",
        "lt",
        "nb",
        "pl",
        "pt",
        "ro",
        "ru",
        "sr",
        "sk",
        "sl",
        "es",
        "sv",
        "tr",
    ]

    # Time filters
    TIME_FILTERS = ["d", "w", "m", "y"]  # day, week, month, year

    def __init__(self, engine: str = "brave"):
        """
        Initialize the Mullvad Leta wrapper.

        Args:
            engine: Search engine to use ("brave" or "google")
        """
        if engine not in self.ENGINES:
            raise ValueError(f"Engine must be one of {self.ENGINES}")

        self.engine = engine
        self.session = requests.Session()

    def _get_headers(self) -> Dict[str, str]:
        """Get request headers with user agent."""
        return {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
            "cache-control": "max-age=0",
            "sec-ch-ua": '"Chromium";v="140", "Not=A?Brand";v="24"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Linux"',
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": get_useragent(),
        }

    def search(
        self,
        query: str,
        country: Optional[str] = None,
        language: Optional[str] = None,
        last_updated: Optional[str] = None,
        page: int = 1,
    ) -> Dict[str, Any]:
        """
        Perform a search on Mullvad Leta.

        Args:
            query: Search query string
            country: Country code filter (e.g., "us", "uk")
            language: Language code filter (e.g., "en", "fr")
            last_updated: Time filter ("d", "w", "m", "y")
            page: Page number (default: 1)

        Returns:
            Dictionary containing search results and metadata
        """
        if country and country not in self.COUNTRIES:
            raise ValueError(f"Invalid country code. Must be one of {self.COUNTRIES}")

        if language and language not in self.LANGUAGES:
            raise ValueError(f"Invalid language code. Must be one of {self.LANGUAGES}")

        if last_updated and last_updated not in self.TIME_FILTERS:
            raise ValueError(f"Invalid time filter. Must be one of {self.TIME_FILTERS}")

        # Build query parameters
        params = {"q": query, "engine": self.engine}

        if country:
            params["country"] = country
        if language:
            params["language"] = language
        if last_updated:
            params["lastUpdated"] = last_updated
        if page > 1:
            params["page"] = str(page)

        # Set cookie for engine preference
        cookies = {"engine": self.engine}

        # Make request
        response = self.session.get(
            self.BASE_URL,
            params=params,
            headers=self._get_headers(),
            cookies=cookies,
            timeout=10,
        )
        response.raise_for_status()

        # Parse results
        return self._parse_results(response.text, query, page)

    def _parse_results(self, html: str, query: str, page: int) -> Dict[str, Any]:
        """
        Parse HTML response and extract search results.

        Args:
            html: HTML response content
            query: Original search query
            page: Current page number

        Returns:
            Dictionary containing parsed results
        """
        soup = BeautifulSoup(html, "html.parser")

        results = {
            "query": query,
            "page": page,
            "engine": self.engine,
            "results": [],
            "infobox": None,
            "news": [],
            "cached": False,
        }

        # Check if cached
        cache_notice = soup.find("p", class_="small")
        if cache_notice and "cached" in cache_notice.text.lower():
            results["cached"] = True

        # Extract regular search results
        articles = soup.find_all("article", class_="svelte-fmlk7p")
        for article in articles:
            result = self._parse_article(article)
            if result:
                results["results"].append(result)

        # Extract infobox if present
        infobox_div = soup.find("div", class_="infobox")
        if infobox_div:
            results["infobox"] = self._parse_infobox(infobox_div)

        # Extract news results
        news_div = soup.find("div", class_="news")
        if news_div:
            news_articles = news_div.find_all("article")
            for article in news_articles:
                news_item = self._parse_news_article(article)
                if news_item:
                    results["news"].append(news_item)

        # Check for next page
        next_button = soup.find("button", {"data-cy": "next-button"})
        results["has_next_page"] = next_button is not None

        return results

    def _parse_article(self, article) -> Optional[Dict[str, str]]:
        """Parse a single search result article."""
        try:
            link_tag = article.find("a", href=True)
            if not link_tag:
                return None

            title_tag = article.find("h3")
            snippet_tag = article.find("p", class_="result__body")
            cite_tag = article.find("cite")

            return {
                "url": link_tag["href"],
                "title": title_tag.get_text(strip=True) if title_tag else "",
                "snippet": snippet_tag.get_text(strip=True) if snippet_tag else "",
                "display_url": cite_tag.get_text(strip=True) if cite_tag else "",
            }
        except Exception as e:
            print(f"Error parsing article: {e}")
            return None

    def _parse_infobox(self, infobox_div) -> Dict[str, Any]:
        """Parse infobox information."""
        infobox = {}

        title_tag = infobox_div.find("h1")
        if title_tag:
            infobox["title"] = title_tag.get_text(strip=True)

        subtitle_tag = infobox_div.find("h2")
        if subtitle_tag:
            infobox["subtitle"] = subtitle_tag.get_text(strip=True)

        url_tag = infobox_div.find("a", rel="noreferrer")
        if url_tag:
            infobox["url"] = url_tag["href"]

        desc_tag = infobox_div.find("p")
        if desc_tag:
            infobox["description"] = desc_tag.get_text(strip=True)

        return infobox

    def _parse_news_article(self, article) -> Optional[Dict[str, str]]:
        """Parse a news article."""
        try:
            link_tag = article.find("a", href=True)
            if not link_tag:
                return None

            title_tag = link_tag.find("h3")
            cite_tag = link_tag.find("cite")
            time_tag = link_tag.find("time")

            return {
                "url": link_tag["href"],
                "title": title_tag.get_text(strip=True) if title_tag else "",
                "source": cite_tag.get_text(strip=True) if cite_tag else "",
                "timestamp": time_tag["datetime"]
                if time_tag and time_tag.has_attr("datetime")
                else "",
            }
        except Exception as e:
            print(f"Error parsing news article: {e}")
            return None
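

# Illustrative helper (not part of the original module): a minimal sketch of
# pagination built on the existing API, combining the `page` argument of
# search() with the `has_next_page` flag set by _parse_results(). The default
# cap of 5 pages is an arbitrary assumption to avoid hammering the service.
def collect_results(
    leta: MullvadLetaWrapper, query: str, max_pages: int = 5, **kwargs: Any
) -> list:
    """Gather result dictionaries from up to `max_pages` consecutive pages."""
    collected = []
    page = 1
    while page <= max_pages:
        data = leta.search(query, page=page, **kwargs)
        collected.extend(data["results"])
        if not data["has_next_page"]:
            break
        page += 1
    return collected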


# Example usage
if __name__ == "__main__":
    # Create wrapper instance
    leta = MullvadLetaWrapper(engine="brave")

    # Perform a search
    results = leta.search("python programming", country="us", language="en")

    # Display results
    print(f"Query: {results['query']}")
    print(f"Engine: {results['engine']}")
    print(f"Cached: {results['cached']}")
    print(f"\nFound {len(results['results'])} results:\n")

    for i, result in enumerate(results["results"][:5], 1):
        print(f"{i}. {result['title']}")
        print(f" URL: {result['url']}")
        print(f" {result['snippet'][:100]}...\n")

    if results["news"]:
        print(f"\nNews ({len(results['news'])} items):")
        for news in results["news"][:3]:
            print(f"- {news['title']}")
            print(f" {news['source']}\n")
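
    # Illustrative extension of the example (assumptions: network access to
    # leta.mullvad.net and the collect_results() sketch defined above). Shows
    # the "last week" time filter and simple pagination.
    recent = leta.search("python programming", last_updated="w")
    print(f"\nResults from the last week: {len(recent['results'])}")
    print(f"More pages available: {recent['has_next_page']}")

    paged = collect_results(leta, "python programming", max_pages=2, country="us")
    print(f"Results gathered across up to 2 pages: {len(paged)}")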