0% found this document useful (0 votes)
53 views4 pages

Keyword Search and Web Crawling Tool

The document is a Python script that crawls websites to search for specific keywords, utilizing the requests and BeautifulSoup libraries for web scraping. It includes functions for checking keywords on individual pages, crawling multiple domains, and processing results with error handling and logging. The script is designed to handle timeouts and irrelevant URLs while maintaining a session for efficiency.

Uploaded by

govindshukla2003
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
53 views4 pages

Keyword Search and Web Crawling Tool

The document is a Python script that crawls websites to search for specific keywords, utilizing the requests and BeautifulSoup libraries for web scraping. It includes functions for checking keywords on individual pages, crawling multiple domains, and processing results with error handling and logging. The script is designed to handle timeouts and irrelevant URLs while maintaining a session for efficiency.

Uploaded by

govindshukla2003
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd

import concurrent.futures
import logging
import re
import time
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Set up logging to capture potential issues during crawling
# (parse failures and request errors are logged, not raised).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

def find_keywords_on_page(url, keywords, case_sensitive=False):
    """Check whether any of *keywords* appear as whole words on a single page.

    Args:
        url: The page URL to fetch.
        keywords: Iterable of keyword strings to search for.
        case_sensitive: If False (default), matching ignores case.

    Returns:
        dict mapping each found keyword to *url* (empty dict if none found),
        or None on any fetch/parse error or when the page declares a
        non-English ``lang`` attribute on its <html> tag.
    """
    headers = {
        # NOTE(review): Chrome version number was garbled in the source —
        # confirm the exact UA string against the original script.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Connection': 'keep-alive',
    }
    try:
        # Context manager guarantees the session is closed on every path,
        # replacing the original's repeated explicit session.close() calls.
        with requests.Session() as session:
            response = session.get(url, timeout=10, headers=headers)
            response.raise_for_status()
            # Prefer the sniffed encoding over the (often wrong) declared one.
            response.encoding = response.apparent_encoding
            try:
                # from_encoding dropped: it only applies to bytes input and is
                # ignored (with a warning) when passing the decoded .text str.
                soup = BeautifulSoup(response.text, 'html.parser')
            except Exception as e:
                logging.error(f"Error parsing HTML with BeautifulSoup for {url}: {e}")
                return None

            # Skip pages that declare a non-English language.
            html_tag = soup.find('html')
            if html_tag is not None:
                lang = html_tag.get('lang', 'en')
                if not lang.startswith('en'):
                    return None

            text = soup.get_text(separator=' ', strip=True)
            if not case_sensitive:
                text = text.lower()

            found_keywords = {}
            for keyword in keywords:
                search_term = keyword if case_sensitive else keyword.lower()
                # Whole-word match; re.escape guards regex metacharacters.
                if re.search(r'\b' + re.escape(search_term) + r'\b', text):
                    found_keywords[keyword] = url
            return found_keywords
    except requests.exceptions.RequestException as e:
        # Network-level failure: log (original swallowed silently) and signal error.
        logging.warning(f"Request failed for {url}: {e}")
        return None
    except Exception as e:
        logging.error(f"Unexpected error while checking {url}: {e}")
        return None

def crawl_and_search(start_url, keywords, session, timeout=300, case_sensitive=False):
    """Breadth-first crawl of a site starting at *start_url*, searching for keywords.

    Returns immediately with ``{keyword: url}`` if a keyword appears in a URL
    itself.  Otherwise accumulates keywords found on pages.

    Args:
        start_url: Root URL; only links whose absolute URL starts with this
            prefix are followed (keeps the crawl on-site).
        keywords: Iterable of keyword strings.
        session: Shared requests.Session for connection reuse across calls.
        timeout: Overall crawl time budget in seconds (default 300).
        case_sensitive: If False (default), URL keyword matching ignores case.
            (Fix: the original read an undeclared global ``case_sensitive``,
            raising NameError when imported as a module.)

    Returns:
        dict of found keywords -> URLs (possibly empty), or None on timeout
        or a fatal request error.
    """
    headers = {
        # NOTE(review): Chrome version number was garbled in the source — confirm.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Connection': 'keep-alive',
    }
    irrelevant_words = ["member", "provider"]
    visited = set()
    # deque gives O(1) popleft(); the original list.pop(0) is O(n) per pop.
    queue = deque([start_url])
    found_matches = {}
    start_time = time.time()
    current_domain = urlparse(start_url).netloc

    while queue:
        current_url = queue.popleft()
        if current_url in visited:
            continue
        visited.add(current_url)

        # Abort the whole crawl once the time budget is spent.
        if time.time() - start_time > timeout:
            print(f"Crawling of {current_domain} stopped due to timeout (> {timeout} seconds).")
            return None

        # A keyword embedded in the URL itself counts as an immediate hit.
        for keyword in keywords:
            haystack = current_url if case_sensitive else current_url.lower()
            needle = keyword if case_sensitive else keyword.lower()
            if needle in haystack:
                return {keyword: current_url}

        # Skip URLs whose path contains an irrelevant segment.
        parsed_url = urlparse(current_url)
        path_segments = [segment.lower() for segment in parsed_url.path.split('/') if segment]
        if any(word in path_segments for word in irrelevant_words):
            continue

        try:
            response = session.get(current_url, timeout=10, headers=headers)
            response.raise_for_status()
            response.encoding = response.apparent_encoding

            try:
                soup = BeautifulSoup(response.text, 'html.parser')
            except Exception as e:
                logging.error(f"Error parsing HTML with BeautifulSoup for {current_url}: {e}")
                continue

            # Skip pages that declare a non-English language.
            html_tag = soup.find('html')
            if html_tag is not None:
                lang = html_tag.get('lang', 'en')
                if not lang.startswith('en'):
                    continue

            links = [urljoin(current_url, link['href'])
                     for link in soup.find_all('a', href=True)]
            page_keywords = find_keywords_on_page(current_url, keywords, case_sensitive)
            if page_keywords:
                found_matches.update(page_keywords)
            for absolute_url in links:
                # Stay within the starting site and avoid revisiting pages.
                if absolute_url.startswith(start_url) and absolute_url not in visited:
                    queue.append(absolute_url)

        except requests.exceptions.HTTPError as e:
            # Guard e.response: it can be None for some HTTPError instances.
            if e.response is not None and e.response.status_code in (404, 403):
                continue  # Missing/forbidden page: skip it, keep crawling.
            return None  # Any other HTTP error aborts this domain's crawl.
        except requests.exceptions.RequestException:
            return None  # Connection-level failure aborts the crawl.
        except Exception as e:
            # Unexpected per-page error: report and keep crawling.
            print(f"Error processing URL {current_url}: {e}")
    return found_matches

def process_domain(url, keywords, session, timeout=300):
    """Process a single domain: crawl it and return its keyword matches.

    Thin wrapper around crawl_and_search that also re-checks the wall-clock
    *timeout* after the crawl returns (the crawl checks it internally too).

    Args:
        url: Domain start URL to crawl.
        keywords: Iterable of keyword strings.
        session: Shared requests.Session passed through to the crawler.
        timeout: Time budget in seconds (default 300).

    Returns:
        The crawl's result dict, or None on error or timeout.
    """
    start_time = time.time()
    result = crawl_and_search(url, keywords, session, timeout)
    end_time = time.time()
    if result is None:  # Timeout or fatal error inside the crawl.
        return None
    if end_time - start_time > timeout:
        print(f"Crawling of {url} stopped due to timeout (> {timeout} seconds).")
        return None
    return result

def search_multiple_domains_from_file(filename, keywords, timeout=300):
    """Search every domain listed in *filename* (one URL per line) for keywords.

    Crawls domains concurrently with a 4-worker thread pool, sharing a single
    HTTP session for connection reuse, and prints any matches as they arrive.

    Args:
        filename: Path to a text file with one start URL per line;
            blank lines are ignored.
        keywords: Iterable of keyword strings.
        timeout: Per-domain crawl budget in seconds (default 300).

    A missing input file is reported to stdout rather than raised.
    """
    try:
        with open(filename, 'r') as file:
            urls = [line.strip() for line in file if line.strip()]
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            # One shared session so all workers reuse connections; the
            # context manager closes it even if a worker raises.
            with requests.Session() as session:
                future_to_url = {
                    executor.submit(process_domain, url, keywords, session, timeout): url
                    for url in urls
                }
                for future in concurrent.futures.as_completed(future_to_url):
                    url = future_to_url[future]
                    try:
                        result = future.result()
                        if result:
                            print(f"\nSearching: {url}")
                            print(f"Found matches in {url}:")
                            for keyword, match_url in result.items():
                                print(f"  Keyword: {keyword}, URL: {match_url}")
                    except Exception as e:
                        # One failed domain must not stop the others.
                        print(f"Error processing domain {url}: {e}")
    except FileNotFoundError:
        # Fix: original printed the literal '(unknown)' instead of the filename.
        print(f"Error: File '{filename}' not found.")

if __name__ == "__main__":
    # Input file: one start URL per line.
    # NOTE(review): filename was garbled in the source — confirm against original.
    domains_file = "domains.txt"
    search_keywords = ["RFP", "Proposal", "Procurement", "Bid"]
    # Kept for backward compatibility: the original crawl read this module
    # global; crawl_and_search now takes it as a keyword parameter instead.
    case_sensitive = False
    timeout_value = 300

    start_time = time.time()
    search_multiple_domains_from_file(domains_file, search_keywords,
                                      timeout=timeout_value)
    end_time = time.time()
    print(f"Total time taken: {end_time - start_time:.2f} seconds")

You might also like