0% found this document useful (0 votes)
53 views4 pages

Keyword Search and Web Crawling Tool

The document is a Python script that crawls websites to search for specific keywords, utilizing the requests and BeautifulSoup libraries for web scraping. It includes functions for checking keywords on individual pages, crawling multiple domains, and processing results with error handling and logging. The script is designed to handle timeouts and irrelevant URLs while maintaining a session for efficiency.

Uploaded by

govindshukla2003
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
53 views4 pages

Keyword Search and Web Crawling Tool

The document is a Python script that crawls websites to search for specific keywords, utilizing the requests and BeautifulSoup libraries for web scraping. It includes functions for checking keywords on individual pages, crawling multiple domains, and processing results with error handling and logging. The script is designed to handle timeouts and irrelevant URLs while maintaining a session for efficiency.

Uploaded by

govindshukla2003
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd

import concurrent.futures
import logging
import re
import time
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Set up logging to capture potential issues during crawling
# (parse failures and request errors are logged, not raised).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

def find_keywords_on_page(url, keywords, case_sensitive=False):
    """Check whether any of *keywords* appear as whole words on a single page.

    Args:
        url: The page URL to fetch.
        keywords: Iterable of keyword strings to search for.
        case_sensitive: If False (default), matching ignores case.

    Returns:
        dict mapping each found keyword to *url* (empty dict if none found),
        or None on any fetch/parse error or when the page declares a
        non-English ``lang`` attribute on its <html> tag.
    """
    headers = {
        # NOTE(review): Chrome version number was garbled in the source —
        # confirm the exact UA string against the original script.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Connection': 'keep-alive',
    }
    try:
        # Context manager guarantees the session is closed on every path,
        # replacing the original's repeated explicit session.close() calls.
        with requests.Session() as session:
            response = session.get(url, timeout=10, headers=headers)
            response.raise_for_status()
            # Prefer the sniffed encoding over the (often wrong) declared one.
            response.encoding = response.apparent_encoding
            try:
                # from_encoding dropped: it only applies to bytes input and is
                # ignored (with a warning) when passing the decoded .text str.
                soup = BeautifulSoup(response.text, 'html.parser')
            except Exception as e:
                logging.error(f"Error parsing HTML with BeautifulSoup for {url}: {e}")
                return None

            # Skip pages that declare a non-English language.
            html_tag = soup.find('html')
            if html_tag is not None:
                lang = html_tag.get('lang', 'en')
                if not lang.startswith('en'):
                    return None

            text = soup.get_text(separator=' ', strip=True)
            if not case_sensitive:
                text = text.lower()

            found_keywords = {}
            for keyword in keywords:
                search_term = keyword if case_sensitive else keyword.lower()
                # Whole-word match; re.escape guards regex metacharacters.
                if re.search(r'\b' + re.escape(search_term) + r'\b', text):
                    found_keywords[keyword] = url
            return found_keywords
    except requests.exceptions.RequestException as e:
        # Network-level failure: log (original swallowed silently) and signal error.
        logging.warning(f"Request failed for {url}: {e}")
        return None
    except Exception as e:
        logging.error(f"Unexpected error while checking {url}: {e}")
        return None

def crawl_and_search(start_url, keywords, session, timeout=300, case_sensitive=False):
    """Breadth-first crawl of a site starting at *start_url*, searching for keywords.

    Returns immediately with ``{keyword: url}`` if a keyword appears in a URL
    itself.  Otherwise accumulates keywords found on pages.

    Args:
        start_url: Root URL; only links whose absolute URL starts with this
            prefix are followed (keeps the crawl on-site).
        keywords: Iterable of keyword strings.
        session: Shared requests.Session for connection reuse across calls.
        timeout: Overall crawl time budget in seconds (default 300).
        case_sensitive: If False (default), URL keyword matching ignores case.
            (Fix: the original read an undeclared global ``case_sensitive``,
            raising NameError when imported as a module.)

    Returns:
        dict of found keywords -> URLs (possibly empty), or None on timeout
        or a fatal request error.
    """
    headers = {
        # NOTE(review): Chrome version number was garbled in the source — confirm.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Connection': 'keep-alive',
    }
    irrelevant_words = ["member", "provider"]
    visited = set()
    # deque gives O(1) popleft(); the original list.pop(0) is O(n) per pop.
    queue = deque([start_url])
    found_matches = {}
    start_time = time.time()
    current_domain = urlparse(start_url).netloc

    while queue:
        current_url = queue.popleft()
        if current_url in visited:
            continue
        visited.add(current_url)

        # Abort the whole crawl once the time budget is spent.
        if time.time() - start_time > timeout:
            print(f"Crawling of {current_domain} stopped due to timeout (> {timeout} seconds).")
            return None

        # A keyword embedded in the URL itself counts as an immediate hit.
        for keyword in keywords:
            haystack = current_url if case_sensitive else current_url.lower()
            needle = keyword if case_sensitive else keyword.lower()
            if needle in haystack:
                return {keyword: current_url}

        # Skip URLs whose path contains an irrelevant segment.
        parsed_url = urlparse(current_url)
        path_segments = [segment.lower() for segment in parsed_url.path.split('/') if segment]
        if any(word in path_segments for word in irrelevant_words):
            continue

        try:
            response = session.get(current_url, timeout=10, headers=headers)
            response.raise_for_status()
            response.encoding = response.apparent_encoding

            try:
                soup = BeautifulSoup(response.text, 'html.parser')
            except Exception as e:
                logging.error(f"Error parsing HTML with BeautifulSoup for {current_url}: {e}")
                continue

            # Skip pages that declare a non-English language.
            html_tag = soup.find('html')
            if html_tag is not None:
                lang = html_tag.get('lang', 'en')
                if not lang.startswith('en'):
                    continue

            links = [urljoin(current_url, link['href'])
                     for link in soup.find_all('a', href=True)]
            page_keywords = find_keywords_on_page(current_url, keywords, case_sensitive)
            if page_keywords:
                found_matches.update(page_keywords)
            for absolute_url in links:
                # Stay within the starting site and avoid revisiting pages.
                if absolute_url.startswith(start_url) and absolute_url not in visited:
                    queue.append(absolute_url)

        except requests.exceptions.HTTPError as e:
            # Guard e.response: it can be None for some HTTPError instances.
            if e.response is not None and e.response.status_code in (404, 403):
                continue  # Missing/forbidden page: skip it, keep crawling.
            return None  # Any other HTTP error aborts this domain's crawl.
        except requests.exceptions.RequestException:
            return None  # Connection-level failure aborts the crawl.
        except Exception as e:
            # Unexpected per-page error: report and keep crawling.
            print(f"Error processing URL {current_url}: {e}")
    return found_matches

def process_domain(url, keywords, session, timeout=300):
    """Process a single domain: crawl it and return its keyword matches.

    Thin wrapper around crawl_and_search that also re-checks the wall-clock
    *timeout* after the crawl returns (the crawl checks it internally too).

    Args:
        url: Domain start URL to crawl.
        keywords: Iterable of keyword strings.
        session: Shared requests.Session passed through to the crawler.
        timeout: Time budget in seconds (default 300).

    Returns:
        The crawl's result dict, or None on error or timeout.
    """
    start_time = time.time()
    result = crawl_and_search(url, keywords, session, timeout)
    end_time = time.time()
    if result is None:  # Timeout or fatal error inside the crawl.
        return None
    if end_time - start_time > timeout:
        print(f"Crawling of {url} stopped due to timeout (> {timeout} seconds).")
        return None
    return result

def search_multiple_domains_from_file(filename, keywords, timeout=300):
    """Search every domain listed in *filename* (one URL per line) for keywords.

    Crawls domains concurrently with a 4-worker thread pool, sharing a single
    HTTP session for connection reuse, and prints any matches as they arrive.

    Args:
        filename: Path to a text file with one start URL per line;
            blank lines are ignored.
        keywords: Iterable of keyword strings.
        timeout: Per-domain crawl budget in seconds (default 300).

    A missing input file is reported to stdout rather than raised.
    """
    try:
        with open(filename, 'r') as file:
            urls = [line.strip() for line in file if line.strip()]
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            # One shared session so all workers reuse connections; the
            # context manager closes it even if a worker raises.
            with requests.Session() as session:
                future_to_url = {
                    executor.submit(process_domain, url, keywords, session, timeout): url
                    for url in urls
                }
                for future in concurrent.futures.as_completed(future_to_url):
                    url = future_to_url[future]
                    try:
                        result = future.result()
                        if result:
                            print(f"\nSearching: {url}")
                            print(f"Found matches in {url}:")
                            for keyword, match_url in result.items():
                                print(f"  Keyword: {keyword}, URL: {match_url}")
                    except Exception as e:
                        # One failed domain must not stop the others.
                        print(f"Error processing domain {url}: {e}")
    except FileNotFoundError:
        # Fix: original printed the literal '(unknown)' instead of the filename.
        print(f"Error: File '{filename}' not found.")

if __name__ == "__main__":
    # Input file: one start URL per line.
    # NOTE(review): filename was garbled in the source — confirm against original.
    domains_file = "domains.txt"
    search_keywords = ["RFP", "Proposal", "Procurement", "Bid"]
    # Kept for backward compatibility: the original crawl read this module
    # global; crawl_and_search now takes it as a keyword parameter instead.
    case_sensitive = False
    timeout_value = 300

    start_time = time.time()
    search_multiple_domains_from_file(domains_file, search_keywords,
                                      timeout=timeout_value)
    end_time = time.time()
    print(f"Total time taken: {end_time - start_time:.2f} seconds")

You might also like