import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
from collections import deque
import concurrent.futures
import time
import logging
# Set up logging to capture potential issues.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

def find_keywords_on_page(url, keywords, case_sensitive=False):
"""Checks if keywords are present on a single web page and returns found
keywords with URL.
Returns a dictionary of found keywords and their URLs. Returns None on
error."""
    # Generic desktop Chrome User-Agent; the version number is a placeholder.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/120.0.0.0 Safari/537.36',
        'Connection': 'keep-alive',
    }
    try:
        session = requests.Session()
        response = session.get(url, timeout=10, headers=headers)
        response.raise_for_status()
        # Let requests guess the encoding from the content itself.
        response.encoding = response.apparent_encoding
        try:
            soup = BeautifulSoup(response.content, 'html.parser',
                                 from_encoding=response.encoding)
        except Exception as e:
            logging.error(f"Error parsing HTML with BeautifulSoup for {url}: {e}")
            session.close()
            return None
        # Check for language metadata here as well.
        if soup.find('html'):
            lang = soup.find('html').get('lang', 'en')
            if not lang.startswith('en'):
                session.close()
                return None
        text = soup.get_text(separator=' ', strip=True)
        if not case_sensitive:
            text = text.lower()
        found_keywords = {}
        for keyword in keywords:
            search_term = keyword.lower() if not case_sensitive else keyword
            # Whole-word match only, so e.g. "Bid" does not match "forbid".
            if re.search(r'\b' + re.escape(search_term) + r'\b', text):
                found_keywords[keyword] = url
        session.close()
        return found_keywords
    except requests.exceptions.RequestException:
        return None
    except Exception:
        return None
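
# A usage sketch for find_keywords_on_page; the URL below is hypothetical:
#
#   matches = find_keywords_on_page("https://www.example.com", ["RFP", "Bid"])
#   if matches:
#       print(matches)  # e.g. {"RFP": "https://www.example.com"}
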
def crawl_and_search(start_url, keywords, session, timeout=300, case_sensitive=False):
    """Crawl a website and search for keywords, returning early if any keyword
    appears in a URL itself.

    Returns a dictionary of found keywords and their URLs, or None on error.
    Uses a shared session.
    """
    visited = set()
    queue = deque([start_url])  # BFS frontier; deque gives O(1) pops from the left.
    found_matches = {}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/120.0.0.0 Safari/537.36',
        'Connection': 'keep-alive',
    }
    # Path segments that mark pages unlikely to be relevant here
    # (e.g. member/provider portals).
    irrelevant_words = ["member", "provider"]
    start_time = time.time()
    current_domain = urlparse(start_url).netloc
    while queue:
        current_url = queue.popleft()
        if current_url in visited:
            continue
        visited.add(current_url)
        # Check for timeout at the beginning of each iteration.
        if time.time() - start_time > timeout:
            print(f"Crawling of {current_domain} stopped due to timeout "
                  f"(> {timeout} seconds).")
            return None
        # Check whether any keyword appears in the URL itself.
        for keyword in keywords:
            needle = keyword if case_sensitive else keyword.lower()
            haystack = current_url if case_sensitive else current_url.lower()
            if needle in haystack:
                # print(f"Match found in URL: {current_url}")
                return {keyword: current_url}
        # Skip URLs containing irrelevant words.
        parsed_url = urlparse(current_url)
        path_segments = [segment.lower()
                         for segment in parsed_url.path.split('/') if segment]
        if any(word in path_segments for word in irrelevant_words):
            # print(f"Skipping URL due to irrelevant words: {current_url}")
            continue
        try:
            response = session.get(current_url, timeout=10, headers=headers)
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            try:
                soup = BeautifulSoup(response.content, 'html.parser',
                                     from_encoding=response.encoding)
            except Exception as e:
                logging.error(f"Error parsing HTML with BeautifulSoup for "
                              f"{current_url}: {e}")
                continue
            # Check for language.
            if soup.find('html'):
                lang = soup.find('html').get('lang', 'en')
                if not lang.startswith('en'):
                    # print(f"Skipping non-English page: {current_url}")
                    continue  # Skip non-English pages.
            links = [urljoin(current_url, link['href'])
                     for link in soup.find_all('a', href=True)]
            # Note: find_keywords_on_page fetches the page again with its own
            # session; acceptable for small crawls, wasteful for large ones.
            page_keywords = find_keywords_on_page(current_url, keywords,
                                                  case_sensitive)
            if page_keywords:
                found_matches.update(page_keywords)
            for absolute_url in links:
                # Stay on the starting site and avoid revisiting pages.
                if absolute_url.startswith(start_url) and absolute_url not in visited:
                    queue.append(absolute_url)
        except requests.exceptions.HTTPError as e:
            if e.response.status_code in [404, 403]:
                # print(f"Skipping URL due to {e.response.status_code} error: {current_url}")
                continue  # Skip this URL and continue with the next one.
            else:
                # print(f"Error fetching URL {current_url}: {e}")
                return None
        except requests.exceptions.RequestException as e:
            # print(f"Error fetching URL {current_url}: {e}")
            return None
        except Exception as e:
            print(f"Error processing URL {current_url}: {e}")
    return found_matches
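
# A usage sketch for crawl_and_search with a shared session (hypothetical URL):
#
#   with requests.Session() as shared_session:
#       found = crawl_and_search("https://www.example.com", ["RFP"],
#                                shared_session, timeout=60)
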
def process_domain(url, keywords, session, timeout=300, case_sensitive=False):
    """Process a single domain using a shared session and a timeout."""
    start_time = time.time()
    result = crawl_and_search(url, keywords, session, timeout, case_sensitive)
    end_time = time.time()
    if result is None:  # None signals a timeout or another error.
        return None
    # Belt-and-braces check; crawl_and_search already enforces the timeout.
    if end_time - start_time > timeout:
        print(f"Crawling of {url} stopped due to timeout (> {timeout} seconds).")
        return None
    return result

def search_multiple_domains_from_file(filename, keywords, timeout=300,
                                      case_sensitive=False):
    """Search multiple domains listed in a text file, using a thread pool."""
    try:
        with open(filename, 'r') as file:
            urls = [line.strip() for line in file if line.strip()]
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            session = requests.Session()
            future_to_url = {
                executor.submit(process_domain, url, keywords, session,
                                timeout, case_sensitive): url
                for url in urls
            }
            for future in concurrent.futures.as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    result = future.result()
                    if result:
                        print(f"\nSearching: {url}")
                        print(f"Found matches in {url}:")
                        for keyword, match_url in result.items():
                            print(f"  Keyword: {keyword}, URL: {match_url}")
                except Exception as e:
                    print(f"Error processing domain {url}: {e}")
            session.close()
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
if __name__ == "__main__":
    domains_file = "domains.txt"  # Assumed filename; see format note below.
    search_keywords = ["RFP", "Proposal", "Procurement", "Bid"]
    case_sensitive = False
    timeout_value = 300
    start_time = time.time()
    search_multiple_domains_from_file(domains_file, search_keywords,
                                      timeout=timeout_value,
                                      case_sensitive=case_sensitive)
    end_time = time.time()
    print(f"Total time taken: {end_time - start_time:.2f} seconds")