diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/README.md
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..cf8836e
--- /dev/null
+++ b/main.py
@@ -0,0 +1,204 @@
+import requests
+from bs4 import BeautifulSoup
+import argparse
+from rich.console import Console
+from rich.markdown import Markdown
+
+def duckduckgo_search(query, num_results=5):
+    # Query DuckDuckGo's HTML-only frontend; passing `params` lets requests URL-encode the query safely
+    url = "https://html.duckduckgo.com/html/"
+
+    # Send a GET request to the DuckDuckGo search page
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+    }
+
+    response = requests.get(url, params={'q': query}, headers=headers, timeout=15)
+
+    # Check if the request was successful
+    if response.status_code != 200:
+        print(f"Failed to retrieve search results. Status code: {response.status_code}")
+        return []
+
+    # Parse the HTML content using BeautifulSoup
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    # Find all result links (they sit in <a> tags with class "result__a")
+    result_links = []
+    for a_tag in soup.find_all('a', class_='result__a'):
+        link = a_tag.get('href')
+        if link:
+            result_links.append(link)
+            if len(result_links) >= num_results:
+                break
+
+    return result_links
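+
+
+# A hedged sketch, not wired into the pipeline above: html.duckduckgo.com
+# often returns redirect wrappers of the form //duckduckgo.com/l/?uddg=<url>.
+# If that holds for your results, a helper like this recovers the target URL.
+def unwrap_duckduckgo_redirect(link):
+    from urllib.parse import urlparse, parse_qs
+    parsed = urlparse(link)
+    qs = parse_qs(parsed.query)
+    if parsed.path == '/l/' and 'uddg' in qs:
+        return qs['uddg'][0]  # parse_qs already percent-decodes the value
+    return link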
+
+
+def extract_text_from_links(links):
+    extracted_texts = []
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
+    }
+
+    for link in links:
+        try:
+            response = requests.get(link, headers=headers, timeout=15)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.content, 'html.parser')
+                # Extract the visible text from the page
+                text = soup.get_text(separator='\n', strip=True)
+                extracted_texts.append((link, text))
+            else:
+                print(f"Failed to retrieve content from {link}. Status code: {response.status_code}")
+        except requests.RequestException as e:
+            print(f"An error occurred while fetching {link}: {e}")
+
+    return extracted_texts
+
+
+def summarize_individual_texts(texts_and_urls, query, ollama_url="http://localhost:11434/api/generate"):
+    summaries = []
+    for url, text in texts_and_urls:
+        prompt = (
+            f"Extract the relevant information from the following text with regard to the "
+            f"original query: '{query}'\n\n{text}\n"
+        )
+        payload = {
+            "model": "command-r",
+            "prompt": prompt,
+            "stream": False,
+            # Ollama takes generation options in "options"; "num_predict" caps
+            # the response length ("max_tokens" is not an Ollama parameter)
+            "options": {"num_predict": 1000}
+        }
+
+        try:
+            response = requests.post(ollama_url, json=payload, timeout=300)
+            if response.status_code == 200:
+                result = response.json()["response"]
+                summaries.append((url, result))
+            else:
+                print(f"Failed to get summary from Ollama server for {url}. Status code: {response.status_code}")
+        except requests.RequestException as e:
+            print(f"An error occurred while sending request to Ollama server for {url}: {e}")
+
+    return summaries
+
+
+def summarize_with_ollama(texts_and_urls, query, ollama_url="http://localhost:11434/api/generate"):
+    # Prepare the context and prompt
+    context = "\n".join([f"URL: {url}\nText: {text}" for url, text in texts_and_urls])
+    prompt = (
+        f"Summarize the following search results with regard to the original query: '{query}' "
+        f"and include the full URLs as references where appropriate. Use markdown and "
+        f"unicode characters to format your response.\n\n{context}"
+    )
+
+    # Create the payload for the POST request
+    payload = {
+        "model": "command-r",
+        "prompt": prompt,
+        "stream": False,
+        "options": {"num_predict": 1500}
+    }
+
+    # Send the POST request to the Ollama server
+    try:
+        print("Processing")
+        response = requests.post(ollama_url, json=payload, timeout=300)
+        if response.status_code == 200:
+            result = response.json()["response"]
+            return result
+        else:
+            print(f"Failed to get summary from Ollama server. Status code: {response.status_code}")
+            return None
+    except requests.RequestException as e:
+        print(f"An error occurred while sending request to Ollama server: {e}")
+        return None
+
+
+def optimize_search_query(query, ollama_url="http://localhost:11434/api/generate"):
+    # Prepare the prompt for optimizing the search query
+    prompt = (
+        f"Optimize the following natural language query to improve its effectiveness "
+        f"in a web search. Make it very concise. query: '{query}'"
+    )
+
+    # Create the payload for the POST request
+    payload = {
+        "model": "command-r",
+        "prompt": prompt,
+        "stream": False,
+        "options": {"num_predict": 50}
+    }
+
+    # Send the POST request to the Ollama server
+    try:
+        print("Optimizing search query")
+        response = requests.post(ollama_url, json=payload, timeout=300)
+        if response.status_code == 200:
+            result = response.json()["response"].strip()
+            return result.strip('"')
+        else:
+            print(f"Failed to optimize search query from Ollama server. Status code: {response.status_code}")
+            return query
+    except requests.RequestException as e:
+        print(f"An error occurred while sending request to Ollama server for optimizing the search query: {e}")
+        return query
+
+
+def pretty_print_markdown(markdown_text):
+    console = Console()
+    md = Markdown(markdown_text)
+    console.print(md)
+
+
+if __name__ == "__main__":
+    # Set up argument parser
+    parser = argparse.ArgumentParser(description="Search DuckDuckGo, extract text from results, and summarize with Ollama.")
+    parser.add_argument("query", type=str, help="The search query to use on DuckDuckGo")
+    parser.add_argument("--num_results", type=int, default=5, help="Number of search results to process (default: 5)")
+
+    # Parse arguments
+    args = parser.parse_args()
+
+    original_query = args.query
+    # Optimize the search query
+    optimized_query = optimize_search_query(original_query)
+    print(f"Original Query: {original_query}")
+    print(f"Optimized Query: {optimized_query}")
+
+    n = args.num_results  # Number of results to extract
+    links = duckduckgo_search(optimized_query, n)
+
+    print(f"Top {n} search results:")
+    for i, link in enumerate(links, start=1):
+        print(f"{i}. {link}")
+
+    texts_and_urls = extract_text_from_links(links)
+
+    print("Summarizing individual search results")
+    intermediate_summaries = summarize_individual_texts(texts_and_urls, original_query)
+
+    final_summary = summarize_with_ollama(intermediate_summaries, original_query)
+
+    if final_summary:
+        print("\nFinal Summary of search results:\n")
+        pretty_print_markdown(final_summary)
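+
+# Example usage (assumes a local Ollama server with the `command-r` model pulled):
+#
+#   python main.py "what is retrieval augmented generation" --num_results 3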