# URL Validation and Login Detection Tool
import argparse
import json
import logging
import os
import signal
import sys
import threading
import time
from datetime import datetime
from logging.handlers import RotatingFileHandler
from queue import Queue
from urllib.parse import urljoin, urlparse

import requests
import tldextract
load_dotenv(override=True)
# Yardımcı Fonksiyonlar
parsed = urlparse(url)
path = [Link]()
netloc = [Link]()
def normalize_url(url):
    """
    Normalize a URL: add an ``https://`` scheme when one is missing and
    strip any fragment.

    Returns the normalized URL string, or ``None`` when the URL uses a
    non-http(s) scheme or cannot be parsed.
    """
    try:
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            # No scheme given: assume 'https://' and re-parse.
            url = 'https://' + url
            parsed_url = urlparse(url)
        elif parsed_url.scheme not in ('http', 'https'):
            # Reject non-web schemes (ftp:, mailto:, javascript:, ...).
            return None
        # NOTE(review): the tail of this function was lost in extraction;
        # reconstructed as "drop the fragment and return the result" to
        # match the function's own docstring — confirm against VCS.
        return parsed_url._replace(fragment='').geturl()
    except (ValueError, AttributeError):
        # Unparseable input is treated as invalid rather than raising.
        return None
def make_array_to_dic(url_listesi):
    """
    Convert a list of URLs into a dictionary grouped by main domains.

    Keys are registrable main domains (``domain.suffix`` per tldextract);
    values are the lists of input URLs belonging to each domain.

    NOTE(review): the middle of this function was lost in extraction; the
    grouping below is reconstructed from the docstring, the surviving
    ``return dict(domain_sozluk)`` line, and the file's tldextract import —
    confirm against version control.
    """
    domain_sozluk = {}
    for url in url_listesi:
        ext = tldextract.extract(url)
        # Fall back to the bare domain when there is no public suffix.
        main_domain = f"{ext.domain}.{ext.suffix}" if ext.suffix else ext.domain
        domain_sozluk.setdefault(main_domain, []).append(url)
    return dict(domain_sozluk)
detected_url = None
for detector_name, detector_func in detectors:
detected_url = detector_func(site)
if detected_url:
[Link](f"Login form detected by **{detector_name}** on site:
{detected_url}")
login_urls.append(detected_url)
break
else:
try:
[Link](f"**{detector_name}** could not find a form on site:
{site}")
except OSError as e:
# Loglama hatasını ele alıyoruz
print(f"Loglama hatası: {e}")
return detected_url
if not driver_instance.check_url_matches_base(normalized_link):
with processed_urls_lock:
processed_urls.add(normalized_link)
continue
with url_list_lock:
if normalized_link not in url_list and normalized_link not in
processed_urls:
enqueue_url(url_queue, normalized_link) # retry_count default 0
url_list.add(normalized_link)
def get_initial_sites(env):
    """
    Return the initial sites to scan for *env*, grouped by main domain.

    NOTE(review): the body was redacted ("##hidden") in this copy; the
    hidden code must define `initial_sites` — restore from version control.
    """
    ##hidden
    initial_sites_dic = make_array_to_dic(initial_sites)
    return initial_sites_dic
def write_detected_logins_to_json(domain, detected_logins, detected_json_file,
                                  json_lock):
    """
    Persist the detected login URLs for *domain* to *detected_json_file* as JSON.

    Args:
        domain: main domain the logins belong to.
        detected_logins: list of detected-login dicts to serialize.
        detected_json_file: output path; overwritten on every call.
        json_lock: lock serializing writers across worker threads.

    Errors are logged, never raised, so a failed write cannot kill a worker.
    """
    data = {
        "domain": domain,
        "detected_logins": detected_logins,
        # Timestamp the snapshot so consumers can tell how fresh it is.
        "detected_at": datetime.now().isoformat()
    }
    try:
        # Hold the lock for the whole write so concurrent threads cannot
        # interleave partial file contents.
        with json_lock:
            with open(detected_json_file, 'w') as file:
                json.dump(data, file, indent=4)
        logging.info(f"Detected logins for {domain} saved to {detected_json_file}.")
    except Exception as e:
        logging.error(f"Error writing detected logins JSON file {detected_json_file} for {domain}: {e}", exc_info=True)
if env == "staging":
post_url = "hidden"
elif env == "dev":
post_url = "hidden"
elif env == "prod":
post_url = "hidden"
try:
post_queue.put(session_data)
[Link](f"Session data for {session_data['domain']} enqueued for
posting.")
except Exception as e:
[Link](f"Error enqueuing session data for {session_data['domain']}:
{e}", exc_info=True)
def check_internet_connection():
    """
    Simple internet connectivity check.

    Tries an HTTP GET against a well-known endpoint with a 5-second
    timeout; returns True on success, False when the connection fails.
    """
    try:
        # NOTE(review): the probe URL was garbled in extraction; a
        # well-known reachable endpoint is assumed here — confirm the
        # original target against version control.
        requests.get('https://www.google.com', timeout=5)
        return True
    except requests.ConnectionError:
        return False
driver_instance = None
driver = None
# WebDriver'ı başlatma
try:
driver_instance = StandardChromeDriver(
site=base_domain, # Base domain bilgisi ile başlatılıyor
keywords=[],
login_pages=[],
instance_id=instance_id
)
driver = driver_instance.get_driver()
if driver is None:
[Link](f"Instance {instance_id}: WebDriver başlatılamadı.")
return # WebDriver başlatılamazsa thread'i sonlandır
# Timeout ayarları
driver.set_page_load_timeout(120) # Sayfa yükleme için 120 saniye
driver.set_script_timeout(120) # Script çalıştırma için 120 saniye
except Exception as e:
[Link](f"Instance {instance_id}: WebDriver başlatma hatası: {e}")
return # WebDriver başlatılamazsa thread'i sonlandır
try:
try:
[Link](current_site)
normalized_site = normalize_url(current_site)
except TimeoutException as e:
[Link](f"Instance {instance_id} siteye gitme zaman aşımına
uğradı: {current_site}. Hata: {e}")
with error_urls_lock:
error_logins.append({"url": current_site, "error": "Page load
timed out."})
continue # Sonraki siteye geç
try:
WebDriverWait(driver, 20).until(
EC.presence_of_element_located((By.TAG_NAME, "body"))
)
[Link](f"Instance {instance_id} sayfa yüklendi:
{current_site}")
except TimeoutException as te:
[Link](f"Instance {instance_id} sayfa yükleme zaman aşımına
uğradı: {current_site}. Hata: {te}")
with error_urls_lock:
error_logins.append({"url": current_site, "error": "Page
loading timed out."})
continue # Sonraki siteye geç
except WebDriverException as we:
[Link](f"Instance {instance_id} WebDriver hatası:
{current_site}. Hata: {we}")
# WebDriver çökmesi durumunda URL'yi tekrar kuyruğa ekle
with error_urls_lock:
error_logins.append({"url": current_site, "error": str(we)})
if retry_count < MAX_RETRIES:
enqueue_url(url_queue, current_site, retry_count + 1)
[Link](f"Instance {instance_id}: {current_site} tekrar
kuyruğa eklendi (Retry: {retry_count + 1}).")
# Mevcut WebDriver'ı kapat ve yenisini başlat
try:
driver_instance.quit_driver()
except:
pass
try:
driver_instance = StandardChromeDriver(
site=base_domain, # Yeni WebDriver'ı aynı base domain ile
başlat
keywords=[],
login_pages=[],
instance_id=instance_id
)
driver = driver_instance.get_driver()
driver.set_page_load_timeout(120)
driver.set_script_timeout(120)
[Link](f"Instance {instance_id}: Yeni WebDriver
başlatıldı.")
except Exception as e:
[Link](f"Instance {instance_id}: Yeni WebDriver başlatma
hatası: {e}")
return # Yeni WebDriver başlatılamazsa thread'i sonlandır
continue # Sonraki siteye geç
if detected_url:
for url in login_urls:
driver_instance.save_login_page(url)
with output_lock:
detected_logins.append({
"url": url,
"detected_at": [Link]().isoformat()
})
else:
[Link](f"Instance {instance_id}: Login form bulunamadı:
{current_site}")
# Diğer işlemler
elements = driver_instance.detect_elements_with_login()
detected_links = [
urljoin(current_site, element.get_attribute('href') or
element.get_attribute('action') or
element.get_attribute('onclick'))
for element in elements
if (element.get_attribute('href') or
element.get_attribute('action') or element.get_attribute('onclick'))
]
if not detected_links:
[Link](f"**detect_elements_with_login** ilgili bağlantı
bulamadı: {current_site}")
else:
[Link](f"**process_detected_links: {current_site}")
process_detected_links(driver_instance, detected_links,
instance_id, url_queue, url_list, processed_urls, url_list_lock,
processed_urls_lock, keyword_manager)
if driver_instance:
driver_instance.quit_driver()
if batch:
for session_data in batch:
try:
response = [Link](url=post_url, verify=False,
json=session_data)
if response.status_code == 200:
[Link](f"Successfully posted session data for
{session_data['domain']} to {post_url}")
elif response.status_code == 400:
[Link](f"Invalid JSON format in the request body for
{session_data['domain']}: {response.status_code}")
elif response.status_code == 500:
[Link](f"Internal server error on test for
{session_data['domain']}: {response.status_code}")
else:
[Link](f"Something went wrong on test for
{session_data['domain']}: {response.status_code}")
except Exception as e:
[Link](f"Error posting session data for
{session_data['domain']}: {e}", exc_info=True)
else:
# Eğer kuyruğa veri eklenmediyse kısa bir süre bekle
[Link](5)
try:
driver_instance = StandardChromeDriver(
site=site,
keywords=[],
login_pages=[],
instance_id=instance_id
)
driver = driver_instance.get_driver()
if driver is None:
[Link](f"WebDriver başlatılamadı {instance_id}: {site}")
with error_urls_lock:
error_logins.append({"url": site, "error": "WebDriver could not be
started."})
return
try:
with processed_urls_lock:
processed_urls.add(site)
login_urls = []
detected_url = detect_login_in_site(driver_instance, site, login_urls,
instance_id)
if detected_url:
for url in login_urls:
driver_instance.save_login_page(url)
with output_lock:
detected_logins.append({
"url": url,
"detected_at": [Link]().isoformat()
})
else:
[Link](f"Başlangıç aşamasında login formu bulunamadı:
{site}")
if not detected_links:
[Link](f"**detect_elements_with_login** ilgili bağlantı
bulamadı: {site}")
else:
process_detected_links(driver_instance, detected_links,
instance_id, url_queue, url_list, processed_urls, url_list_lock,
processed_urls_lock, keyword_manager)
except TimeoutException as te:
[Link](f"Instance {instance_id} siteyi işlerken zaman aşımına
uğradı: {site}. Hata: {te}", exc_info=True)
with error_urls_lock:
error_logins.append({"url": site, "error": "TimeoutException during
processing."})
except WebDriverException as we:
[Link](f"Instance {instance_id} WebDriver hatası siteyi
işlerken: {site}. Hata: {we}", exc_info=True)
with error_urls_lock:
error_logins.append({"url": site, "error": str(we)})
except Exception as e:
[Link](f"Instance {instance_id} siteyi işlerken hata oluştu:
{site}. Hata: {e}", exc_info=True)
with error_urls_lock:
error_logins.append({"url": site, "error": str(e)})
except Exception as e:
[Link](f"Instance {instance_id} istisna aldı: {e}", exc_info=True)
with error_urls_lock:
error_logins.append({"url": site, "error": str(e)})
finally:
driver_instance.quit_driver()
def setup_logging(log_directory):
    """
    Create *log_directory* and configure root logging to a date-stamped,
    size-rotated log file.

    Returns:
        The full path of the log file.
    """
    os.makedirs(log_directory, exist_ok=True)
    log_file = f"login-page-finder.{datetime.now().strftime('%d.%m.%y')}.log"
    log_path = os.path.join(log_directory, log_file)
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    # NOTE(review): the handler attachment was lost/garbled in the original;
    # without it the computed log_path is never written to, and the file's
    # RotatingFileHandler import goes unused — confirm rotation limits.
    if not any(isinstance(h, RotatingFileHandler) for h in logger.handlers):
        handler = RotatingFileHandler(log_path, maxBytes=10 * 1024 * 1024,
                                      backupCount=5)
        handler.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
        logger.addHandler(handler)
    logging.info(f"LOG_PATH: {log_path}")
    return log_path
def parse_arguments():
    """
    Parse command-line arguments.

    Positional ``host`` is optional (nargs='?', defaults to None);
    ``--env`` selects the target environment and defaults to 'dev'.
    """
    parser = argparse.ArgumentParser(description="Login Page Finder")
    parser.add_argument(help='host(s) to scan', dest='host', nargs='?')
    parser.add_argument('--env', choices=['staging', 'dev', 'prod'],
                        help='Environment to use', dest='env', default='dev')
    return parser.parse_args()
def main():
    """Entry point: configure logging/dirs, load initial sites, run the crawl loop."""
# NOTE(review): this block was damaged by text extraction — dotted calls were
# replaced with "[Link]", long statements are wrapped mid-line, and several
# names used here (num_instances, domain, url_queue, url_list, processed_urls,
# reconnect_event, monitor_thread, detected_logins_session,
# present_known_domains, is_valid_url) are defined outside the visible span.
# Restore this function from version control before editing its logic.
args = parse_arguments()
env = [Link]
log_directory = "log/"
detected_logs_directory = "detectedlogs/"
error_logs_directory = "errorlogs/"
setup_logging(log_directory)
[Link](detected_logs_directory, exist_ok=True)
[Link](error_logs_directory, exist_ok=True)
[Link](f"NUM_INSTANCES: {num_instances}")
THREAD_TIMEOUT_SECONDS = int([Link]("THREAD_TIMEOUT_SECONDS", "60")) # 60
saniye olarak ayarlandı
[Link](f"THREAD_TIMEOUT_SECONDS: {THREAD_TIMEOUT_SECONDS}")
initial_sites_dict = get_initial_sites(env)
if not initial_sites_dict:
[Link]("İşlenecek site bulunamadı.")
return
keyword_manager = KeywordManager()
# Locks guarding the collections shared between worker threads.
json_lock = [Link]()
error_urls_lock = [Link]()
processed_urls_lock = [Link]()
url_list_lock = [Link]()
output_lock = [Link]()
exit_event = [Link]()
threads = []
# Normalize domain
normalized_domain = normalize_url(domain)
if normalized_domain and is_valid_url(normalized_domain, keyword_manager):
with url_list_lock:
if normalized_domain not in url_list and normalized_domain not in
processed_urls:
enqueue_url(url_queue, normalized_domain) # retry_count
default 0
url_list.add(normalized_domain)
processed_urls.add(normalized_domain)
else:
[Link](f"Skipping initial domain due to invalid URL or unwanted
extension: {domain}")
try:
while not exit_event.is_set():
if reconnect_event.is_set():
# Internet connection lost: shut the threads down and wait for the queue
bekle
[Link]("İnternet bağlantısı kesildi, thread'ler
kapatılıyor.")
for thread in threads:
[Link](timeout=1)
[Link]()
monitor_thread.join()
# Add the known login URLs to the results
known_logins = present_known_domains.get(domain, set())
# NOTE(review): KeywordManager defines present_known_domains as a set, but
# .get() requires a dict mapping domain -> login URLs — confirm intended type.
if known_logins:
with output_lock:
for login_url in known_logins:
detected_logins_session.append({
"url": login_url,
"detected_at": [Link]().isoformat(),
"source": "present_known_domains"
})
[Link](f"Added known login URLs for {domain} to detected
logins.")
with url_list_lock:
url_list.clear()
[Link](f"url_list temizlendi: {domain} için.")
with processed_urls_lock:
processed_urls.clear()
[Link](f"processed_urls temizlendi: {domain} için.")
[Link]()
[Link](f"Thread listesi temizlendi: {domain} için.")
# keyword_manager.py
#
# You can choose the word lists the program uses in its detection formulas.
# For example, when the program flags a page that is not a login page, you can
# delete the keyword that caused that page to be detected as a login page.
# The program uses login_keywords, username_keywords, password_keywords, and
# submit_keywords for detection.
# The program does not use common_login_paths and angular_keywords; you can
# skip them.
# You can exclude file types via unwanted_extension_keywords, because you know
# that e.g. PDFs do not contain login pages.
# You can block specific URLs via unwanted_keywords. Thanks to this list we can
# block a specific path on a site while still being able to connect to the rest
# of that site.
# You can ban an entire domain via unwanted_domains.
# present_known_domains is an unfinished, in-development feature: when the
# program works on such a domain, it should automatically add that domain's
# known login pages to detected logins, so that banned domains' login pages can
# be included in the results as well.
class KeywordManager:
    """
    Holds the keyword and URL lists used by the login-page detectors.

    The login/username/password/submit keyword lists drive form detection;
    the unwanted_* collections filter out URLs, file extensions, and whole
    domains; common_login_paths and angular_keywords are kept but unused.
    """

    def __init__(self):
        # Keywords suggesting a login / sign-in page (multi-language).
        self.login_keywords = [
            'login', 'signin', 'giriş', 'oturum aç', 'kullanıcı girişi', 'login-2',
            'log in', 'sign in', 'entrance', 'access', 'member', 'authenticate',
            'logon', 'connect', 'connexion', 'account', 'user', 'signon',
            'iniciar sesión', '登录', '注册', 'вход', 'войти', 'accedi', 'anmelden',
            'se connecter', 'entrar', 'ログイン', 'サインイン', 'hesap', 'kaydol', 'üye ol,',
            ' Müşteri No / T.C. Kimlik No', 'Müşteri No,', 'T.C. Kimlik No'
        ]
        # Keywords identifying a username / e-mail input field.
        self.username_keywords = [
            'user', 'username', 'email', 'e-mail', 'kullanıcı', 'e-posta',
            'e-posta adresi', 'mail',
            'correo', 'usuario', 'ユーザー', 'メールアドレス', '用户名', '电子邮件', '账户',
            'utilisateur', 'benutzername',
            'customerOrIdentityNumber', 'üye ol', 'Müşteri No / T.C. Kimlik No',
            'Müşteri No',
            'T.C. Kimlik No', 'customerNumber', 'userNumber', 'müşteri no',
            # NOTE(review): the address below was garbled in extraction
            # ('someone@[Link]'); 'example.com' is a guess — confirm.
            'kullanıcı no', 'userNameInput', 'User Account', 'someone@example.com',
            'kunyeYayinlayan'
        ]
        # Keywords identifying a password input field.
        self.password_keywords = [
            'password', 'pass', 'şifre', 'parola', 'contraseña', 'mot de passe',
            'passwort', '密码', 'パスワード', 'пароль', 'senha', 'sifre', 'sifre',
            'şifrə', 'passwordInput'
        ]
        # Keywords identifying the submit / login button.
        self.submit_keywords = [
            'login', 'signin', 'giriş', 'oturum aç', 'log in', 'sign in',
            'send', 'enter', 'continue', 'logowanie', 'accedi',
            'Yeni Şifre Al / Şifremi Unuttum', 'Şifremi Unuttum', 'Yeni Şifre Al',
        ]
        # Well-known login paths (currently unused by the detectors).
        # NOTE(review): two entries below were garbled in extraction
        # ('/[Link]') — restore the real paths from version control.
        self.common_login_paths = [
            '/login', '/signin', '/user/login', '/account/login', '/admin/login',
            '/[Link]', '/auth/login', '/[Link]', '/giris', '/oturum-ac',
            'login-2', 'login-2', 'pttweb',
            '/auth/register'
        ]
        # Markers of Angular applications (currently unused by the detectors).
        self.angular_keywords = [
            'ng-app', 'ng-controller', '__ngContext__', '@angular', 'ng-version',
            '[Link]', 'ngIf', 'ngFor', 'ngModel', 'ngClass', 'ngStyle',
            'data-ng-app', 'data-ng-controller', 'ng-bind', 'ng-repeat'
        ]
        # File extensions that cannot host login pages (images, documents...).
        self.unwanted_extension_keywords = {
            '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg', '.ico', '.pdf',
            '.zip', '.doc', '.docx', '.xls', '.xlsx'
        }
        # URL substrings to skip (blocks specific paths without banning a domain).
        self.unwanted_keywords = {
            'video-id', 'photos',
            'kutuphane', 'baslatAnket', 'baslatSanalTur', 'duyuru', 'haber',
            'ekotaban', 'saybis'
        }
        # Domains banned entirely.
        # NOTE(review): the real domain strings were lost in extraction
        # ('[Link]' placeholders, several with their closing quote dropped);
        # restore the original eight domains from version control.
        self.unwanted_domains = {
            '[Link]',
        }
        # Known login URLs (development feature, not finished).
        # NOTE(review): most entries were garbled in extraction; the two
        # line-wrapped URLs below are rejoined, the rest need restoring.
        # NOTE(review): this is a set, but main() calls
        # present_known_domains.get(domain, set()), which requires a dict
        # mapping domain -> set of login URLs — confirm the intended type.
        self.present_known_domains = {
            '[Link]p=2&devam=2f796f7264616d2f3f703d302664696c3d33&dil=0#girisForm',
            '[Link]%C4%B1OturumuA%C3%A7ma',
        }
# [Link]
import argparse
import logging
import os
import time
import json
import signal
import sys
from datetime import datetime
import requests
import tldextract
import threading
from queue import Queue
from [Link] import RotatingFileHandler
load_dotenv(override=True)
# Yardımcı Fonksiyonlar
parsed = urlparse(url)
path = [Link]()
netloc = [Link]()
def normalize_url(url):
    """
    Normalize a URL: add an ``https://`` scheme when one is missing and
    strip any fragment.

    Returns the normalized URL string, or ``None`` when the URL uses a
    non-http(s) scheme or cannot be parsed.
    """
    try:
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            # No scheme given: assume 'https://' and re-parse.
            url = 'https://' + url
            parsed_url = urlparse(url)
        elif parsed_url.scheme not in ('http', 'https'):
            # Reject non-web schemes (ftp:, mailto:, javascript:, ...).
            return None
        # NOTE(review): the tail of this function was lost in extraction;
        # reconstructed as "drop the fragment and return the result" to
        # match the function's own docstring — confirm against VCS.
        return parsed_url._replace(fragment='').geturl()
    except (ValueError, AttributeError):
        # Unparseable input is treated as invalid rather than raising.
        return None
def make_array_to_dic(url_listesi):
"""
Converts a list of URLs into a dictionary grouped by main domains.
detectors = [
("Login Form Detector", driver_instance.detect_login_form),
("Submit Button Detector", driver_instance.detect_submit_button),
("Input Field Detector", driver_instance.detect_input_fields_with_login),
("Shadow DOM Detector", driver_instance.detect_shadow_dom_with_login)
]
detected_url = None
for detector_name, detector_func in detectors:
detected_url = detector_func(site)
if detected_url:
[Link](f"Login form detected by **{detector_name}** on site:
{detected_url}")
login_urls.append(detected_url)
break
else:
try:
[Link](f"**{detector_name}** could not find a form on site:
{site}")
except OSError as e:
# Loglama hatasını ele alıyoruz
print(f"Loglama hatası: {e}")
return detected_url
if not driver_instance.check_url_matches_base(normalized_link):
with processed_urls_lock:
processed_urls.add(normalized_link)
continue
with url_list_lock:
if normalized_link not in url_list and normalized_link not in
processed_urls:
enqueue_url(url_queue, normalized_link) # retry_count default 0
url_list.add(normalized_link)
def enqueue_url(url_queue, url, retry_count=0):
    """Put a (url, retry_count) pair onto the crawl queue; retries default to 0."""
    item = (url, retry_count)
    url_queue.put(item)
def get_initial_sites(env):
    """
    Return the initial sites to scan for *env*, grouped by main domain.

    NOTE(review): the body was redacted ("##hidden") in this copy; the
    hidden code must define `initial_sites` — restore from version control.
    """
    ##hidden
    initial_sites_dic = make_array_to_dic(initial_sites)
    return initial_sites_dic
if env == "staging":
post_url = "hidden"
elif env == "dev":
post_url = "hidden"
elif env == "prod":
post_url = "hidden"
try:
post_queue.put(session_data)
[Link](f"Session data for {session_data['domain']} enqueued for
posting.")
except Exception as e:
[Link](f"Error enqueuing session data for {session_data['domain']}:
{e}", exc_info=True)
def check_internet_connection():
    """
    Simple internet connectivity check.

    Tries an HTTP GET against a well-known endpoint with a 5-second
    timeout; returns True on success, False when the connection fails.
    """
    try:
        # NOTE(review): the probe URL was garbled in extraction; a
        # well-known reachable endpoint is assumed here — confirm the
        # original target against version control.
        requests.get('https://www.google.com', timeout=5)
        return True
    except requests.ConnectionError:
        return False
# WebDriver'ı başlatma
try:
driver_instance = StandardChromeDriver(
site=base_domain, # Base domain bilgisi ile başlatılıyor
keywords=[],
login_pages=[],
instance_id=instance_id
)
driver = driver_instance.get_driver()
if driver is None:
[Link](f"Instance {instance_id}: WebDriver başlatılamadı.")
return # WebDriver başlatılamazsa thread'i sonlandır
# Timeout ayarları
driver.set_page_load_timeout(120) # Sayfa yükleme için 120 saniye
driver.set_script_timeout(120) # Script çalıştırma için 120 saniye
except Exception as e:
[Link](f"Instance {instance_id}: WebDriver başlatma hatası: {e}")
return # WebDriver başlatılamazsa thread'i sonlandır
try:
try:
[Link](current_site)
normalized_site = normalize_url(current_site)
except TimeoutException as e:
[Link](f"Instance {instance_id} siteye gitme zaman aşımına
uğradı: {current_site}. Hata: {e}")
with error_urls_lock:
error_logins.append({"url": current_site, "error": "Page load
timed out."})
continue # Sonraki siteye geç
try:
WebDriverWait(driver, 20).until(
EC.presence_of_element_located((By.TAG_NAME, "body"))
)
[Link](f"Instance {instance_id} sayfa yüklendi:
{current_site}")
except TimeoutException as te:
[Link](f"Instance {instance_id} sayfa yükleme zaman aşımına
uğradı: {current_site}. Hata: {te}")
with error_urls_lock:
error_logins.append({"url": current_site, "error": "Page
loading timed out."})
continue # Sonraki siteye geç
except WebDriverException as we:
[Link](f"Instance {instance_id} WebDriver hatası:
{current_site}. Hata: {we}")
# WebDriver çökmesi durumunda URL'yi tekrar kuyruğa ekle
with error_urls_lock:
error_logins.append({"url": current_site, "error": str(we)})
if retry_count < MAX_RETRIES:
enqueue_url(url_queue, current_site, retry_count + 1)
[Link](f"Instance {instance_id}: {current_site} tekrar
kuyruğa eklendi (Retry: {retry_count + 1}).")
# Mevcut WebDriver'ı kapat ve yenisini başlat
try:
driver_instance.quit_driver()
except:
pass
try:
driver_instance = StandardChromeDriver(
site=base_domain, # Yeni WebDriver'ı aynı base domain ile
başlat
keywords=[],
login_pages=[],
instance_id=instance_id
)
driver = driver_instance.get_driver()
driver.set_page_load_timeout(120)
driver.set_script_timeout(120)
[Link](f"Instance {instance_id}: Yeni WebDriver
başlatıldı.")
except Exception as e:
[Link](f"Instance {instance_id}: Yeni WebDriver başlatma
hatası: {e}")
return # Yeni WebDriver başlatılamazsa thread'i sonlandır
continue # Sonraki siteye geç
if detected_url:
for url in login_urls:
driver_instance.save_login_page(url)
with output_lock:
detected_logins.append({
"url": url,
"detected_at": [Link]().isoformat()
})
else:
[Link](f"Instance {instance_id}: Login form bulunamadı:
{current_site}")
# Diğer işlemler
elements = driver_instance.detect_elements_with_login()
detected_links = [
urljoin(current_site, element.get_attribute('href') or
element.get_attribute('action') or
element.get_attribute('onclick'))
for element in elements
if (element.get_attribute('href') or
element.get_attribute('action') or element.get_attribute('onclick'))
]
if not detected_links:
[Link](f"**detect_elements_with_login** ilgili bağlantı
bulamadı: {current_site}")
else:
[Link](f"**process_detected_links: {current_site}")
process_detected_links(driver_instance, detected_links,
instance_id, url_queue, url_list, processed_urls, url_list_lock,
processed_urls_lock, keyword_manager)
if driver_instance:
driver_instance.quit_driver()
if batch:
for session_data in batch:
try:
response = [Link](url=post_url, verify=False,
json=session_data)
if response.status_code == 200:
[Link](f"Successfully posted session data for
{session_data['domain']} to {post_url}")
elif response.status_code == 400:
[Link](f"Invalid JSON format in the request body for
{session_data['domain']}: {response.status_code}")
elif response.status_code == 500:
[Link](f"Internal server error on test for
{session_data['domain']}: {response.status_code}")
else:
[Link](f"Something went wrong on test for
{session_data['domain']}: {response.status_code}")
except Exception as e:
[Link](f"Error posting session data for
{session_data['domain']}: {e}", exc_info=True)
else:
# Eğer kuyruğa veri eklenmediyse kısa bir süre bekle
[Link](5)
try:
driver_instance = StandardChromeDriver(
site=site,
keywords=[],
login_pages=[],
instance_id=instance_id
)
driver = driver_instance.get_driver()
if driver is None:
[Link](f"WebDriver başlatılamadı {instance_id}: {site}")
with error_urls_lock:
error_logins.append({"url": site, "error": "WebDriver could not be
started."})
return
try:
with processed_urls_lock:
processed_urls.add(site)
login_urls = []
detected_url = detect_login_in_site(driver_instance, site, login_urls,
instance_id)
if detected_url:
for url in login_urls:
driver_instance.save_login_page(url)
with output_lock:
detected_logins.append({
"url": url,
"detected_at": [Link]().isoformat()
})
else:
[Link](f"Başlangıç aşamasında login formu bulunamadı:
{site}")
except Exception as e:
[Link](f"Instance {instance_id} istisna aldı: {e}", exc_info=True)
with error_urls_lock:
error_logins.append({"url": site, "error": str(e)})
finally:
driver_instance.quit_driver()
def setup_logging(log_directory):
    """
    Create *log_directory* and configure root logging to a date-stamped,
    size-rotated log file.

    Returns:
        The full path of the log file.
    """
    os.makedirs(log_directory, exist_ok=True)
    log_file = f"login-page-finder.{datetime.now().strftime('%d.%m.%y')}.log"
    log_path = os.path.join(log_directory, log_file)
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    # NOTE(review): the handler attachment was lost/garbled in the original;
    # without it the computed log_path is never written to, and the file's
    # RotatingFileHandler import goes unused — confirm rotation limits.
    if not any(isinstance(h, RotatingFileHandler) for h in logger.handlers):
        handler = RotatingFileHandler(log_path, maxBytes=10 * 1024 * 1024,
                                      backupCount=5)
        handler.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
        logger.addHandler(handler)
    logging.info(f"LOG_PATH: {log_path}")
    return log_path
def parse_arguments():
    """
    Parse command-line arguments.

    Positional ``host`` is optional (nargs='?', defaults to None);
    ``--env`` selects the target environment and defaults to 'dev'.
    """
    parser = argparse.ArgumentParser(description="Login Page Finder")
    parser.add_argument(help='host(s) to scan', dest='host', nargs='?')
    parser.add_argument('--env', choices=['staging', 'dev', 'prod'],
                        help='Environment to use', dest='env', default='dev')
    return parser.parse_args()
def main():
    """Entry point: configure logging/dirs, load initial sites, run the crawl loop."""
# NOTE(review): this block was damaged by text extraction — dotted calls were
# replaced with "[Link]", long statements are wrapped mid-line, and several
# names used here (num_instances, domain, url_queue, url_list, processed_urls,
# reconnect_event, monitor_thread, is_valid_url) are defined outside the
# visible span. Restore this function from version control before editing it.
args = parse_arguments()
env = [Link]
log_directory = "log/"
detected_logs_directory = "detectedlogs/"
error_logs_directory = "errorlogs/"
setup_logging(log_directory)
[Link](detected_logs_directory, exist_ok=True)
[Link](error_logs_directory, exist_ok=True)
[Link](f"NUM_INSTANCES: {num_instances}")
THREAD_TIMEOUT_SECONDS = int([Link]("THREAD_TIMEOUT_SECONDS", "60")) # 60
saniye olarak ayarlandı
[Link](f"THREAD_TIMEOUT_SECONDS: {THREAD_TIMEOUT_SECONDS}")
initial_sites_dict = get_initial_sites(env)
if not initial_sites_dict:
[Link]("İşlenecek site bulunamadı.")
return
keyword_manager = KeywordManager()
# Locks guarding the collections shared between worker threads.
json_lock = [Link]()
error_urls_lock = [Link]()
processed_urls_lock = [Link]()
url_list_lock = [Link]()
output_lock = [Link]()
exit_event = [Link]()
threads = []
# Normalize domain
normalized_domain = normalize_url(domain)
if normalized_domain and is_valid_url(normalized_domain, keyword_manager):
with url_list_lock:
if normalized_domain not in url_list and normalized_domain not in
processed_urls:
enqueue_url(url_queue, normalized_domain) # retry_count
default 0
url_list.add(normalized_domain)
processed_urls.add(normalized_domain)
else:
[Link](f"Skipping initial domain due to invalid URL or unwanted
extension: {domain}")
try:
while not exit_event.is_set():
if reconnect_event.is_set():
# Internet connection lost: shut the threads down and wait for the queue
bekle
[Link]("İnternet bağlantısı kesildi, thread'ler
kapatılıyor.")
for thread in threads:
[Link](timeout=1)
[Link]()
monitor_thread.join()
with url_list_lock:
url_list.clear()
[Link](f"url_list temizlendi: {domain} için.")
with processed_urls_lock:
processed_urls.clear()
[Link](f"processed_urls temizlendi: {domain} için.")
[Link]()
[Link](f"Thread listesi temizlendi: {domain} için.")
[Link](f"Tüm siteler işlendi ve {domain} için JSON dosyalarına
yazıldı.")
# Script entry point: run the crawler only when executed directly.
if __name__ == "__main__":
    main()