0% found this document useful (0 votes)
57 views35 pages

URL Validation and Login Detection Tool

The document is a Python script that implements a web scraping tool using Selenium to detect login forms on various websites. It includes functions for URL validation, normalization, and error handling, as well as threading for concurrent processing of URLs. The script also logs detected logins and errors to JSON files and monitors internet connectivity to ensure consistent operation.

Uploaded by

w6vzw75yrf
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF or TXT, or read online on Scribd
0% found this document useful (0 votes)
57 views35 pages

URL Validation and Login Detection Tool

The document is a Python script that implements a web scraping tool using Selenium to detect login forms on various websites. It includes functions for URL validation, normalization, and error handling, as well as threading for concurrent processing of URLs. The script also logs detected logins and errors to JSON files and monitors internet connectivity to ensure consistent operation.

Uploaded by

w6vzw75yrf
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd

# main.py

# Standard library
import argparse
import json
import logging
import os
import queue
import signal
import sys
import threading
import time
from collections import defaultdict
from datetime import datetime
from logging.handlers import RotatingFileHandler
from queue import Queue
from urllib.parse import urlparse, urlunparse, urljoin

# Third-party
import requests
import tldextract
from dotenv import load_dotenv
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, WebDriverException

# Local
from browser_driver import StandardChromeDriver
from keyword_manager import KeywordManager

load_dotenv(override=True)

# Maksimum deneme sayısı


MAX_RETRIES = 3

# Yardımcı Fonksiyonlar

def is_valid_url(url, keyword_manager):
    """Decide whether a URL is worth crawling.

    A URL is rejected when its path ends with an unwanted file extension
    (e.g. images/PDFs), when its path contains an unwanted keyword, or when
    its host is on the banned-domain list.

    :param url: absolute URL to check
    :param keyword_manager: KeywordManager supplying the block lists
    :return: True if the URL should be processed, False otherwise
    """
    unwanted_extension_keywords = keyword_manager.get_unwanted_extension_keywords()
    unwanted_keywords = keyword_manager.get_unwanted_keywords()
    unwanted_domains = keyword_manager.get_unwanted_domains()

    parsed = urlparse(url)
    # Lower-case both parts so the checks are case-insensitive.
    path = parsed.path.lower()
    netloc = parsed.netloc.lower()

    logging.debug(f"URL Kontrolü: {url}")
    logging.debug(f"Path: {path}, Netloc: {netloc}")

    # Reject file-like URLs (images, documents, archives...).
    if any(path.endswith(ext) for ext in unwanted_extension_keywords):
        logging.info(f"URL '{url}' is invalid due to unwanted extension in path.")
        return False

    # Reject paths containing blocked keywords (news, galleries, surveys...).
    if any(keyword in path for keyword in unwanted_keywords):
        logging.info(f"URL '{url}' is invalid due to unwanted keyword in path.")
        return False

    # Reject hosts that are banned outright.
    if netloc in unwanted_domains:
        logging.info(f"URL '{url}' is invalid because it's in the unwanted domains list.")
        return False

    logging.debug(f"URL '{url}' geçerli.")
    return True

def normalize_url(url):
    """Normalize a URL: add a scheme when missing and strip the fragment.

    :param url: raw URL string
    :return: normalized URL, or None when the scheme is neither http nor
             https, or when parsing fails.
    """
    try:
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            # No scheme given: assume https and re-parse.
            # NOTE(review): the original literal was redacted — it may have
            # been 'http://'; confirm against the original source.
            url = 'https://' + url
            parsed_url = urlparse(url)
        elif parsed_url.scheme not in ('http', 'https'):
            # Non-web schemes (ftp, mailto, ...) are not crawlable here.
            return None

        # Fragments never change the fetched document; drop them so the
        # dedupe sets treat "page" and "page#top" as the same URL.
        parsed_url = parsed_url._replace(fragment='')
        return urlunparse(parsed_url)
    except Exception as e:
        logging.error(f"URL normalization failed: {url}. Error: {e}")
        return None

def make_array_to_dic(url_listesi):
    """Convert a list of URLs into a dictionary grouped by registered domain.

    Subdomains are grouped under their parent "domain.suffix" key, e.g.
    sub.example.com and example.com both land under "example.com".

    :param url_listesi: list of URLs (domains and subdomains)
    :return: dict mapping lower-cased main domain -> list of its URLs
    """
    domain_sozluk = defaultdict(list)

    for url in url_listesi:
        extracted = tldextract.extract(url)
        # Registered domain = second-level domain + public suffix.
        ana_domain = f"{extracted.domain}.{extracted.suffix}".lower()
        domain_sozluk[ana_domain].append(url)

    return dict(domain_sozluk)

def detect_login_in_site(driver_instance, site, login_urls, instance_id):
    """Run the login-page detectors from browser_driver in order.

    Stops at the first detector that reports a hit; the detected URL is
    appended to *login_urls* (mutated in place).

    :param driver_instance: browser_driver.StandardChromeDriver instance
    :param site: URL currently loaded in the driver
    :param login_urls: output list; detected login URL is appended here
    :param instance_id: worker id, used only for log context
    :return: the detected login URL, or None when no detector matched
    """
    logging.info(f"Instance {instance_id} checking for login form on site: {site}")
    detectors = [
        ("Login Form Detector", driver_instance.detect_login_form),
        ("Submit Button Detector", driver_instance.detect_submit_button),
        ("Input Field Detector", driver_instance.detect_input_fields_with_login),
        ("Shadow DOM Detector", driver_instance.detect_shadow_dom_with_login)
    ]

    detected_url = None
    for detector_name, detector_func in detectors:
        detected_url = detector_func(site)
        if detected_url:
            logging.info(f"Login form detected by **{detector_name}** on site: {detected_url}")
            login_urls.append(detected_url)
            break
        else:
            try:
                logging.info(f"**{detector_name}** could not find a form on site: {site}")
            except OSError as e:
                # Logging itself can fail (e.g. rotated/removed log file);
                # fall back to stdout rather than killing the detector loop.
                print(f"Loglama hatası: {e}")

    return detected_url

def process_detected_links(driver_instance, detected_links, instance_id, url_queue,
                           url_list, processed_urls, url_list_lock,
                           processed_urls_lock, keyword_manager):
    """Filter candidate links and enqueue the crawlable ones.

    Each link is normalized, validated against the keyword/domain block
    lists, checked against the crawl's base domain, and — if new — put on
    the shared URL queue. Rejected links are marked processed so they are
    not revisited.

    :param detected_links: candidate URLs harvested from the current page
    :param url_queue: shared work queue of (url, retry_count) items
    :param url_list: set of URLs already enqueued (guarded by url_list_lock)
    :param processed_urls: set of URLs already handled/rejected
    """
    for link in detected_links:
        normalized_link = normalize_url(link)
        if not normalized_link:
            continue

        if not is_valid_url(normalized_link, keyword_manager):
            logging.info(f"Skipping URL with unwanted extension or keyword: {normalized_link}")
            with processed_urls_lock:
                processed_urls.add(normalized_link)
            continue

        # Stay on the target site: drop links pointing off the base domain.
        if not driver_instance.check_url_matches_base(normalized_link):
            with processed_urls_lock:
                processed_urls.add(normalized_link)
            continue

        with url_list_lock:
            if normalized_link not in url_list and normalized_link not in processed_urls:
                enqueue_url(url_queue, normalized_link)  # retry_count default 0
                url_list.add(normalized_link)

def enqueue_url(url_queue, url, retry_count=0):
    """Add a (url, retry_count) work item to the crawl queue."""
    item = (url, retry_count)
    url_queue.put(item)

def get_initial_sites(env):
    # Return the seed URLs for the given environment, grouped by
    # registered domain (see make_array_to_dic).
    ##hidden
    # NOTE(review): the code that builds `initial_sites` for the given `env`
    # was redacted ("##hidden") in this copy of the source; as written this
    # function raises NameError. Restore the redacted lookup before running.

    initial_sites_dic = make_array_to_dic(initial_sites)

    return initial_sites_dic
def write_detected_logins_to_json(domain, detected_logins, detected_json_file,
                                  json_lock):
    """Persist the detected login pages for one domain to a JSON file.

    :param domain: domain the results belong to
    :param detected_logins: list of {"url": ..., "detected_at": ...} dicts
    :param detected_json_file: output file path (overwritten on each call)
    :param json_lock: lock serializing JSON writes across threads
    """
    data = {
        "domain": domain,
        "detected_logins": detected_logins,
        "detected_at": datetime.now().isoformat()
    }
    try:
        with json_lock:
            with open(detected_json_file, 'w') as file:
                json.dump(data, file, indent=4)
        logging.info(f"Detected logins for {domain} saved to {detected_json_file}.")
    except Exception as e:
        logging.error(f"Error writing detected logins JSON file {detected_json_file} for {domain}: {e}", exc_info=True)

def write_errors_to_json(domain, error_logins, error_json_file, json_lock):
    """Persist the errors collected for one domain to a JSON file.

    :param domain: domain the errors belong to
    :param error_logins: list of {"url": ..., "error": ...} dicts
    :param error_json_file: output file path (overwritten on each call)
    :param json_lock: lock serializing JSON writes across threads
    """
    data = {
        "domain": domain,
        "errors": error_logins,
        "detected_at": datetime.now().isoformat()
    }
    try:
        with json_lock:
            with open(error_json_file, 'w') as file:
                json.dump(data, file, indent=4)
        logging.info(f"Errors for {domain} saved to {error_json_file}.")
    except Exception as e:
        logging.error(f"Error writing errors JSON file {error_json_file} for {domain}: {e}", exc_info=True)

# Fix: a stray module-level `response = requests.post(...)` line stood here,
# referencing `post_url`/`session_data` which do not exist at import time and
# would raise NameError on module load. The actual POST happens in
# post_data_worker; the stray line has been removed.
def post_session_data(env, session_data):
    """Enqueue session data for asynchronous posting by post_data_worker.

    :param env: environment name ("staging" | "dev" | "prod")
    :param session_data: {"domain": ..., "detected_logins": [...]} payload

    NOTE(review): `post_url` is selected per environment but never used here
    — the worker thread receives its own post URL. The endpoint values were
    redacted ("hidden") in this copy of the source.
    """
    if env == "staging":
        post_url = "hidden"
    elif env == "dev":
        post_url = "hidden"
    elif env == "prod":
        post_url = "hidden"

    try:
        # `post_queue` is a module-level Queue created in main().
        post_queue.put(session_data)
        logging.info(f"Session data for {session_data['domain']} enqueued for posting.")
    except Exception as e:
        logging.error(f"Error enqueuing session data for {session_data['domain']}: {e}", exc_info=True)

def check_internet_connection():
    """Simple internet connectivity check.

    :return: True when an HTTP request to a well-known host succeeds within
             5 seconds, False otherwise.

    NOTE(review): the probe URL was redacted in this copy of the source;
    a common choice is https://www.google.com — confirm against the original.
    """
    try:
        requests.get('https://www.google.com', timeout=5)
        return True
    except requests.RequestException:
        return False

def signal_handler(sig, frame, exit_event, threads):
    """Handle CTRL+C: signal all worker threads to stop, then exit.

    :param sig: signal number (unused, required by the signal API)
    :param frame: current stack frame (unused, required by the signal API)
    :param exit_event: threading.Event that workers poll to stop
    :param threads: worker threads to give a short grace period

    Exits the process with status 0 (raises SystemExit).
    """
    logging.info("CTRL + C detected! Closing all threads...")
    exit_event.set()
    # Give each thread a brief chance to finish; don't block shutdown on them.
    for thread in threads:
        thread.join(timeout=1)
    logging.info("All threads have been signaled to exit.")
    sys.exit(0)

def internet_monitor(exit_event, reconnect_event, url_queue, detected_logins,
                     error_logins, json_lock, error_urls_lock, url_list_lock,
                     processed_urls_lock, keyword_manager, domain):
    """Watch internet connectivity and flag outages via *reconnect_event*.

    Polls once a minute. On an outage it sets reconnect_event (the main loop
    then shuts the workers down) and waits in 5-minute steps until the
    connection returns, after which reconnect_event is cleared so the main
    loop can restart the workers.

    NOTE(review): most parameters (queue, result lists, locks, ...) are
    currently unused here; they appear to be reserved for restart logic.
    """
    while not exit_event.is_set():
        if not check_internet_connection():
            logging.warning("İnternet bağlantısı kesildi.")
            # Tell the main loop to stop the worker threads.
            reconnect_event.set()
            # Block until connectivity returns (or shutdown is requested).
            while not check_internet_connection():
                if exit_event.is_set():
                    break
                logging.info("Bağlantı sağlanamadı, 5 dakika sonra tekrar denenecek.")
                time.sleep(300)  # wait 5 minutes between probes
            if not exit_event.is_set():
                logging.info("İnternet bağlantısı geri geldi. Thread'ler yeniden başlatılıyor.")
                reconnect_event.clear()
        time.sleep(60)  # check once per minute

def _restart_webdriver(old_instance, base_domain, instance_id):
    """Quit a crashed WebDriver and start a replacement for the same domain.

    :return: (driver_instance, driver) on success; (None, None) when the
             replacement WebDriver cannot be started.
    """
    try:
        old_instance.quit_driver()
    except Exception:
        # Best effort — the old driver may already be dead.
        pass
    try:
        new_instance = StandardChromeDriver(
            site=base_domain,
            keywords=[],
            login_pages=[],
            instance_id=instance_id
        )
        new_driver = new_instance.get_driver()
        new_driver.set_page_load_timeout(120)
        new_driver.set_script_timeout(120)
        logging.info(f"Instance {instance_id}: Yeni WebDriver başlatıldı.")
        return new_instance, new_driver
    except Exception as e:
        logging.error(f"Instance {instance_id}: Yeni WebDriver başlatma hatası: {e}")
        return None, None


def worker(detected_logins, error_logins, output_lock, instance_id,
           url_queue, url_list, processed_urls, error_urls, exit_event, json_lock,
           error_urls_lock, url_list_lock, processed_urls_lock, keyword_manager,
           base_domain):
    """Crawl-worker thread: pull URLs from the queue, detect login forms,
    harvest further candidate links, and record results/errors.

    The thread exits when exit_event is set, when it has been idle longer
    than THREAD_TIMEOUT_SECONDS, or when its WebDriver cannot be (re)started.
    Shared collections are guarded by the corresponding locks.
    """
    THREAD_TIMEOUT_SECONDS = 60  # idle time before the thread shuts itself down
    last_active_time = time.time()

    driver_instance = None
    driver = None

    # Start the WebDriver for this worker.
    try:
        driver_instance = StandardChromeDriver(
            site=base_domain,  # initialized with the base-domain context
            keywords=[],
            login_pages=[],
            instance_id=instance_id
        )
        driver = driver_instance.get_driver()
        if driver is None:
            logging.error(f"Instance {instance_id}: WebDriver başlatılamadı.")
            return  # cannot work without a driver
        driver.set_page_load_timeout(120)
        driver.set_script_timeout(120)
    except Exception as e:
        logging.error(f"Instance {instance_id}: WebDriver başlatma hatası: {e}")
        return

    logging.info(f"{base_domain}-Instance-{instance_id} thread'i başladı.")

    while not exit_event.is_set():
        try:
            current_site, retry_count = url_queue.get(timeout=1)
            last_active_time = time.time()  # refresh idle timer on new work
        except queue.Empty:  # fix: was a bare `except:` swallowing everything
            if (time.time() - last_active_time) > THREAD_TIMEOUT_SECONDS:
                logging.info(f"Instance {instance_id} zaman aşımına uğradı. Thread kapanıyor.")
                break  # idle too long — shut down
            if exit_event.is_set():
                logging.info(f"Instance {instance_id} kapanma sinyali aldı. Kapanıyor.")
                break
            continue  # keep waiting for work

        logging.info(f"Instance {instance_id} işleniyor: {current_site} (Retry: {retry_count})")
        site_start_time = time.time()

        try:
            try:
                driver.get(current_site)
                normalized_site = normalize_url(current_site)  # NOTE(review): unused
            except TimeoutException as e:
                logging.warning(f"Instance {instance_id} siteye gitme zaman aşımına uğradı: {current_site}. Hata: {e}")
                with error_urls_lock:
                    error_logins.append({"url": current_site, "error": "Page load timed out."})
                continue  # move on to the next site

            try:
                WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )
                logging.info(f"Instance {instance_id} sayfa yüklendi: {current_site}")
            except TimeoutException as te:
                logging.warning(f"Instance {instance_id} sayfa yükleme zaman aşımına uğradı: {current_site}. Hata: {te}")
                with error_urls_lock:
                    error_logins.append({"url": current_site, "error": "Page loading timed out."})
                continue
            except WebDriverException as we:
                logging.error(f"Instance {instance_id} WebDriver hatası: {current_site}. Hata: {we}")
                # Driver crashed: record, optionally re-queue, restart driver.
                with error_urls_lock:
                    error_logins.append({"url": current_site, "error": str(we)})
                if retry_count < MAX_RETRIES:
                    enqueue_url(url_queue, current_site, retry_count + 1)
                    logging.info(f"Instance {instance_id}: {current_site} tekrar kuyruğa eklendi (Retry: {retry_count + 1}).")
                driver_instance, driver = _restart_webdriver(driver_instance, base_domain, instance_id)
                if driver is None:
                    return  # replacement failed — end this thread
                continue

            # Run the login detectors on the loaded page.
            login_urls = []
            detected_url = detect_login_in_site(driver_instance, current_site,
                                                login_urls, instance_id)

            if detected_url:
                for url in login_urls:
                    driver_instance.save_login_page(url)
                    with output_lock:
                        detected_logins.append({
                            "url": url,
                            "detected_at": datetime.now().isoformat()
                        })
            else:
                logging.info(f"Instance {instance_id}: Login form bulunamadı: {current_site}")

            # Harvest further login-related links from the page.
            elements = driver_instance.detect_elements_with_login()
            detected_links = [
                urljoin(current_site, element.get_attribute('href') or
                        element.get_attribute('action') or
                        element.get_attribute('onclick'))
                for element in elements
                if (element.get_attribute('href') or
                    element.get_attribute('action') or element.get_attribute('onclick'))
            ]

            if not detected_links:
                logging.info(f"**detect_elements_with_login** ilgili bağlantı bulamadı: {current_site}")
            else:
                logging.info(f"**process_detected_links: {current_site}")
                process_detected_links(driver_instance, detected_links,
                                       instance_id, url_queue, url_list, processed_urls,
                                       url_list_lock, processed_urls_lock, keyword_manager)

            # Flag sites whose processing exceeded the budget.
            if time.time() - site_start_time > THREAD_TIMEOUT_SECONDS:
                logging.warning(f"Instance {instance_id} site işleme süresi aşıldı: {current_site}")
                continue

        except TimeoutException as te:
            logging.warning(f"Instance {instance_id} siteyi işlerken zaman aşımına uğradı: {current_site}. Hata: {te}")
            with error_urls_lock:
                error_logins.append({"url": current_site, "error": "TimeoutException during processing."})
        except WebDriverException as we:
            logging.error(f"Instance {instance_id} WebDriver hatası siteyi işlerken: {current_site}. Hata: {we}")
            # Driver crashed mid-processing: record, optionally re-queue, restart.
            with error_urls_lock:
                error_logins.append({"url": current_site, "error": str(we)})
            if retry_count < MAX_RETRIES:
                enqueue_url(url_queue, current_site, retry_count + 1)
                logging.info(f"Instance {instance_id}: {current_site} tekrar kuyruğa eklendi (Retry: {retry_count + 1}).")
            driver_instance, driver = _restart_webdriver(driver_instance, base_domain, instance_id)
            if driver is None:
                return
        except Exception as e:
            logging.error(f"Instance {instance_id} siteyi işlerken hata oluştu: {current_site}. Hata: {e}")
            with error_urls_lock:
                error_logins.append({"url": current_site, "error": str(e)})
        finally:
            url_queue.task_done()

    if driver_instance:
        driver_instance.quit_driver()

def post_data_worker(post_queue, post_stop_event, post_url):
    """Drain the post queue and POST session data in batches of 100.

    After each non-empty batch the worker sleeps 5 minutes (interruptible
    via *post_stop_event*) before collecting the next one.

    :param post_queue: queue of session-data dicts ({"domain": ..., ...})
    :param post_stop_event: threading.Event signalling shutdown
    :param post_url: endpoint URL to POST each payload to
    """
    batch_size = 100
    while not post_stop_event.is_set():
        batch = []
        try:
            # Collect up to batch_size items; a 1s get-timeout ends collection
            # early when the queue runs dry.
            while len(batch) < batch_size:
                item = post_queue.get(timeout=1)
                batch.append(item)
        except queue.Empty:
            pass  # queue drained — post whatever we have

        if batch:
            for session_data in batch:
                try:
                    # NOTE(review): verify=False disables TLS certificate
                    # checking — acceptable only for internal test endpoints.
                    response = requests.post(url=post_url, verify=False, json=session_data)
                    if response.status_code == 200:
                        logging.info(f"Successfully posted session data for {session_data['domain']} to {post_url}")
                    elif response.status_code == 400:
                        logging.error(f"Invalid JSON format in the request body for {session_data['domain']}: {response.status_code}")
                    elif response.status_code == 500:
                        logging.error(f"Internal server error on test for {session_data['domain']}: {response.status_code}")
                    else:
                        logging.error(f"Something went wrong on test for {session_data['domain']}: {response.status_code}")
                except Exception as e:
                    logging.error(f"Error posting session data for {session_data['domain']}: {e}", exc_info=True)

            # Throttle: wait 5 minutes between batches, 1s at a time so the
            # stop event is honored promptly.
            logging.info("Batch of 100 session data posted. Waiting for 5 minutes before next batch.")
            for _ in range(300):  # 300 seconds = 5 minutes
                if post_stop_event.is_set():
                    break
                time.sleep(1)
        else:
            # Nothing queued — idle briefly before polling again.
            time.sleep(5)

def start_threads(detected_logins, error_logins, output_lock, num_instances,
                  domain,
                  url_queue, url_list, processed_urls, error_urls, exit_event,
                  json_lock, error_urls_lock, url_list_lock, processed_urls_lock,
                  keyword_manager, threads):
    """Start *num_instances* worker threads for *domain* and log each start.

    Threads are appended to the shared *threads* list so the caller can
    join them later. A 2-second stagger between starts avoids launching
    all WebDrivers at once.
    """
    for i in range(num_instances):
        instance_id = i + 1
        thread_name = f"{domain}-Instance-{instance_id}"
        thread = threading.Thread(target=worker, args=(
            detected_logins, error_logins, output_lock, instance_id,
            url_queue, url_list, processed_urls, error_urls, exit_event,
            json_lock, error_urls_lock, url_list_lock, processed_urls_lock,
            keyword_manager, domain  # domain doubles as base_domain
        ), name=thread_name)
        thread.start()
        threads.append(thread)
        logging.info(f"{thread_name} başlatıldı.")
        time.sleep(2)  # stagger WebDriver startups

def initial_detection(site, instance_id, detected_logins, error_logins,
                      output_lock,
                      url_queue, url_list, processed_urls, error_urls,
                      json_lock, error_urls_lock, url_list_lock,
                      processed_urls_lock, keyword_manager):
    """Seed pass over the entry site, run in the main thread.

    Loads *site* in a fresh WebDriver, runs the login detectors, saves any
    detected login page, and enqueues further candidate links for the
    worker threads. All failures are recorded in *error_logins*.
    """
    logging.info(f"Başlangıç aşaması: Siteyi kontrol ediyor: {site}")

    driver_instance = None  # fix: defined before try so finally can't NameError
    try:
        driver_instance = StandardChromeDriver(
            site=site,
            keywords=[],
            login_pages=[],
            instance_id=instance_id
        )

        driver = driver_instance.get_driver()
        if driver is None:
            logging.error(f"WebDriver başlatılamadı {instance_id}: {site}")
            with error_urls_lock:
                error_logins.append({"url": site, "error": "WebDriver could not be started."})
            return

        logging.info(f"Instance {instance_id} successfully get_driver : {site}")

        try:
            logging.info(f"Instance {instance_id} try get : {site}")
            driver.get(site)
            logging.info(f"Instance {instance_id} successfully get : {site}")
            normalized_site = normalize_url(site)  # NOTE(review): unused
            logging.info(f"Instance {instance_id} successfully normalize {site}")
        except WebDriverException as e:
            logging.error(f"Instance {instance_id} siteye gitme zaman aşımına uğradı: {site}. Hata: {e}", exc_info=True)
            with error_urls_lock:
                error_logins.append({"url": site, "error": str(e)})
            return

        try:
            # Wait until the document body exists before running detectors.
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            logging.info(f"Instance {instance_id} sayfa yüklendi: {site}")
        except TimeoutException as te:
            logging.warning(f"Instance {instance_id} sayfa yükleme zaman aşımına uğradı: {site}. Hata: {te}", exc_info=True)
            with error_urls_lock:
                error_logins.append({"url": site, "error": "Page loading timed out."})
            return
        except WebDriverException as we:
            logging.error(f"Instance {instance_id} WebDriver hatası: {site}. Hata: {we}", exc_info=True)
            with error_urls_lock:
                error_logins.append({"url": site, "error": str(we)})
            return

        try:
            # The entry site itself counts as processed either way.
            with processed_urls_lock:
                processed_urls.add(site)

            login_urls = []
            detected_url = detect_login_in_site(driver_instance, site, login_urls,
                                                instance_id)

            if detected_url:
                for url in login_urls:
                    driver_instance.save_login_page(url)
                    with output_lock:
                        detected_logins.append({
                            "url": url,
                            "detected_at": datetime.now().isoformat()
                        })
            else:
                logging.info(f"Başlangıç aşamasında login formu bulunamadı: {site}")

            # Harvest candidate links for the worker threads.
            # NOTE(review): this log line fires even when a login form WAS
            # found above — possibly unintended; confirm desired behavior.
            logging.info(f"Primary detectors did not find a login form, starting **detect_elements_with_login**: {site}")
            elements = driver_instance.detect_elements_with_login()
            detected_links = [
                urljoin(site, element.get_attribute('href') or
                        element.get_attribute('action') or
                        element.get_attribute('onclick'))
                for element in elements
                if (element.get_attribute('href') or
                    element.get_attribute('action') or element.get_attribute('onclick'))
            ]

            if not detected_links:
                logging.info(f"**detect_elements_with_login** ilgili bağlantı bulamadı: {site}")
            else:
                process_detected_links(driver_instance, detected_links,
                                       instance_id, url_queue, url_list, processed_urls,
                                       url_list_lock, processed_urls_lock, keyword_manager)
        except TimeoutException as te:
            logging.warning(f"Instance {instance_id} siteyi işlerken zaman aşımına uğradı: {site}. Hata: {te}", exc_info=True)
            with error_urls_lock:
                error_logins.append({"url": site, "error": "TimeoutException during processing."})
        except WebDriverException as we:
            logging.error(f"Instance {instance_id} WebDriver hatası siteyi işlerken: {site}. Hata: {we}", exc_info=True)
            with error_urls_lock:
                error_logins.append({"url": site, "error": str(we)})
        except Exception as e:
            logging.error(f"Instance {instance_id} siteyi işlerken hata oluştu: {site}. Hata: {e}", exc_info=True)
            with error_urls_lock:
                error_logins.append({"url": site, "error": str(e)})

    except Exception as e:
        logging.error(f"Instance {instance_id} istisna aldı: {e}", exc_info=True)
        with error_urls_lock:
            error_logins.append({"url": site, "error": str(e)})
    finally:
        # Fix: guard against the constructor itself having failed.
        if driver_instance is not None:
            driver_instance.quit_driver()

def setup_logging(log_directory):
    """Configure the root logger to write to a dated, rotating log file.

    Removes any pre-existing handlers so repeated calls do not duplicate
    output, then attaches a RotatingFileHandler (5 MB per file, 5 backups).

    :param log_directory: directory for log files (created if missing)
    :return: full path of the active log file
    """
    os.makedirs(log_directory, exist_ok=True)
    log_file = f"login-page-finder.{datetime.now().strftime('%d.%m.%y')}.log"
    log_path = os.path.join(log_directory, log_file)

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Drop existing handlers; iterate over a copy since we mutate the list.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)

    # Rotate log files: 5 MB per file, keep 5 backups.
    file_handler = RotatingFileHandler(log_path, maxBytes=5 * 1024 * 1024,
                                       backupCount=5)
    file_handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(threadName)s - %(message)s')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    logging.info(f"LOG_PATH: {log_path}")
    return log_path

def parse_arguments(argv=None):
    """Parse command-line arguments.

    :param argv: optional list of argument strings; defaults to sys.argv[1:]
                 (exposed as a parameter for testability — backward-compatible).
    :return: argparse.Namespace with `host` (optional positional) and `env`.
    """
    parser = argparse.ArgumentParser(description="Login Page Finder")
    # Fix: positional arguments must be declared by name; passing only
    # dest= for a positional makes argparse raise ValueError.
    parser.add_argument('host', nargs='?', help='host(s) to scan')
    parser.add_argument('--env', choices=['staging', 'dev', 'prod'],
                        help='Environment to use', dest='env', default='dev')
    return parser.parse_args(argv)

def main():
    """Entry point: crawl every configured domain and report login pages.

    For each domain: seed the URL queue, run an initial detection pass in
    the main thread, fan out worker threads, monitor connectivity, then
    write results/errors to JSON and enqueue them for posting.
    """
    args = parse_arguments()
    env = args.env

    if env not in ["staging", "dev", "prod"]:
        logging.error(f"Invalid environment: {env}")
        logging.error("Options are: ['staging','dev','prod']")
        exit()

    num_instances = int(os.getenv("NUM_INSTANCES", "1"))
    if num_instances <= 0:
        print(f"NUM_INSTANCES değeri geçersiz: {num_instances}. 1 veya daha fazla olmalı.")
        logging.error(f"NUM_INSTANCES değeri geçersiz: {num_instances}. 1 veya daha fazla olmalı.")
        exit()

    log_directory = "log/"
    detected_logs_directory = "detectedlogs/"
    error_logs_directory = "errorlogs/"
    setup_logging(log_directory)
    os.makedirs(detected_logs_directory, exist_ok=True)
    os.makedirs(error_logs_directory, exist_ok=True)

    logging.info(f"NUM_INSTANCES: {num_instances}")
    THREAD_TIMEOUT_SECONDS = int(os.getenv("THREAD_TIMEOUT_SECONDS", "60"))
    logging.info(f"THREAD_TIMEOUT_SECONDS: {THREAD_TIMEOUT_SECONDS}")

    initial_sites_dict = get_initial_sites(env)
    if not initial_sites_dict:
        logging.info("İşlenecek site bulunamadı.")
        return

    keyword_manager = KeywordManager()

    # Known login URLs per domain, appended to the results unconditionally.
    # NOTE(review): domains/URLs were redacted ("[Link]") in this copy of
    # the source; restore the real values from the original.
    present_known_domains = {
        '[Link]': {
            '[Link]?p=2&devam=2f796f7264616d2f3f703d302664696c3d33&dil=0#girisForm',
            '[Link]',
        },
        # Diğer domainler için de ekleyebilirsiniz
    }

    # Posting queue and its background worker thread.
    global post_queue  # accessed by post_session_data()
    post_queue = Queue()
    post_stop_event = threading.Event()

    # NOTE(review): the default POST endpoint was redacted in this copy.
    post_url = os.getenv("POST_URL", "[Link]")

    post_thread = threading.Thread(target=post_data_worker,
                                   args=(post_queue, post_stop_event, post_url),
                                   daemon=True)
    post_thread.start()
    logging.info("Post data worker thread started.")

    # Connectivity-loss signalling between monitor thread and main loop.
    reconnect_event = threading.Event()

    for domain, subdomains in initial_sites_dict.items():
        logging.info(f"Domain için oturum başlatılıyor: {domain} ile subdomain'ler: {subdomains}")

        # Per-domain work queue, dedupe sets, and result lists.
        url_queue = Queue()
        processed_urls = set()
        url_list = set()
        error_urls = []
        detected_logins_session = []  # per-session detected logins
        error_logins_session = []     # per-session errors

        json_lock = threading.Lock()
        error_urls_lock = threading.Lock()
        processed_urls_lock = threading.Lock()
        url_list_lock = threading.Lock()
        output_lock = threading.Lock()

        exit_event = threading.Event()
        threads = []

        # Route CTRL+C to a clean shutdown of this session's threads.
        signal.signal(signal.SIGINT, lambda sig, frame: signal_handler(
            sig, frame, exit_event, threads))

        # Normalize and seed the main domain.
        normalized_domain = normalize_url(domain)
        if normalized_domain and is_valid_url(normalized_domain, keyword_manager):
            with url_list_lock:
                if normalized_domain not in url_list and normalized_domain not in processed_urls:
                    enqueue_url(url_queue, normalized_domain)  # retry_count default 0
                    url_list.add(normalized_domain)
                    processed_urls.add(normalized_domain)
        else:
            logging.info(f"Skipping initial domain due to invalid URL or unwanted extension: {domain}")

        # Normalize and seed each subdomain.
        for subdomain in subdomains:
            normalized_subdomain = normalize_url(subdomain)
            if normalized_subdomain and is_valid_url(normalized_subdomain, keyword_manager):
                with url_list_lock:
                    if normalized_subdomain not in url_list and normalized_subdomain not in processed_urls:
                        enqueue_url(url_queue, normalized_subdomain)  # retry_count default 0
                        url_list.add(normalized_subdomain)
                        processed_urls.add(normalized_subdomain)
            else:
                logging.info(f"Skipping subdomain due to invalid URL or unwanted extension: {subdomain}")

        # Initial detection runs in the main thread.
        initial_detection(normalized_domain, 0, detected_logins_session,
                          error_logins_session, output_lock,
                          url_queue, url_list, processed_urls, error_urls,
                          json_lock, error_urls_lock, url_list_lock,
                          processed_urls_lock, keyword_manager)

        # Fan out the worker threads.
        start_threads(detected_logins_session, error_logins_session, output_lock,
                      num_instances, domain,
                      url_queue, url_list, processed_urls, error_urls, exit_event,
                      json_lock, error_urls_lock, url_list_lock,
                      processed_urls_lock, keyword_manager, threads)

        # Start the connectivity monitor for this session.
        monitor_thread = threading.Thread(target=internet_monitor, args=(
            exit_event, reconnect_event, url_queue, detected_logins_session,
            error_logins_session,
            json_lock, error_urls_lock, url_list_lock, processed_urls_lock,
            keyword_manager, domain
        ), name=f"{domain}-InternetMonitor")
        monitor_thread.start()

        try:
            while not exit_event.is_set():
                if reconnect_event.is_set():
                    # Connectivity lost: stop workers, wait for it to return.
                    logging.info("İnternet bağlantısı kesildi, thread'ler kapatılıyor.")
                    for thread in threads:
                        thread.join(timeout=1)
                    threads.clear()

                    if check_internet_connection():
                        logging.info("İnternet bağlantısı geri geldi, thread'ler yeniden başlatılıyor.")
                        start_threads(detected_logins_session,
                                      error_logins_session, output_lock,
                                      num_instances, domain,
                                      url_queue, url_list, processed_urls,
                                      error_urls, exit_event,
                                      json_lock, error_urls_lock, url_list_lock,
                                      processed_urls_lock, keyword_manager, threads)
                        reconnect_event.clear()
                elif threads and not any(t.is_alive() for t in threads):
                    # Fix: all workers have idled out for this domain — move
                    # on to the next one (previously this loop could only end
                    # via CTRL+C, so later domains were never processed).
                    break
                time.sleep(1)
        except KeyboardInterrupt:
            logging.info("Ana süreçte KeyboardInterrupt alındı. Shutdown başlatılıyor...")
            signal_handler(None, None, exit_event, threads)

        # Wait for the worker threads.
        for thread in threads:
            thread.join()

        # Fix: signal the monitor thread to stop before joining it, otherwise
        # this join blocks forever (the monitor only exits on exit_event).
        exit_event.set()
        monitor_thread.join()

        # Append the known login URLs to the results.
        known_logins = present_known_domains.get(domain, set())
        if known_logins:
            with output_lock:
                for login_url in known_logins:
                    detected_logins_session.append({
                        "url": login_url,
                        "detected_at": datetime.now().isoformat(),
                        "source": "present_known_domains"
                    })
            logging.info(f"Added known login URLs for {domain} to detected logins.")

        # Write detected logins to JSON.
        if detected_logins_session:
            detected_json_file = os.path.join(
                detected_logs_directory,
                f"{domain.replace('.', '_')}_detected_logins.json")
            write_detected_logins_to_json(domain, detected_logins_session,
                                          detected_json_file, json_lock)

        # Prepare and enqueue the session data for posting.
        session_data = {
            "domain": domain,
            "detected_logins": detected_logins_session
        }
        post_session_data(env, session_data)

        # Write errors to JSON.
        if error_logins_session:
            error_json_file = os.path.join(
                error_logs_directory,
                f"{domain.replace('.', '_')}_errors.json")
            write_errors_to_json(domain, error_logins_session, error_json_file,
                                 json_lock)

        # Reset per-session state before the next domain.
        with error_urls_lock:
            error_logins_session.clear()

        with url_list_lock:
            url_list.clear()
            logging.info(f"url_list temizlendi: {domain} için.")

        with processed_urls_lock:
            processed_urls.clear()
            logging.info(f"processed_urls temizlendi: {domain} için.")

        threads.clear()
        logging.info(f"Thread listesi temizlendi: {domain} için.")

        logging.info(f"Tüm siteler işlendi ve {domain} için JSON dosyalarına yazıldı.")

    # All domains processed: stop the posting worker.
    post_stop_event.set()
    post_thread.join()
    logging.info("Post data worker thread stopped.")


if __name__ == "__main__":
    main()

# keyword_manager.py
# You can choose the word lists which the program uses in its formulas.
# For example, when the program detects a page that is not a login page, you can
delete the keyword that caused the program to detect that page as a login page.
# The program uses login_keywords, username_keywords, password_keywords,
submit_keywords for detection.
# The program does not use common_login_paths and angular_keywords; you can pass
them.
# You can decide unwanted extensions via unwanted_extension_keywords because you
know that PDFs do not have login pages.
# You can decide unwanted keywords via unwanted_keywords. Thanks to this
# list, we can block specific URLs; for example, you can block [Link]
# but still be able to connect to [Link]
# You can ban an entire domain via unwanted_domains.
# present_known_domains is a development part of the code; it's not finished yet.
# I am trying to ensure that when the program works on a domain, it automatically
adds that domain to detected logins, so we can also add banned domains' login pages
to detected logins.

from typing import List

class KeywordManager:
    """Central registry of the keyword/path/domain lists used for detection.

    The detectors use login_keywords, username_keywords, password_keywords
    and submit_keywords; common_login_paths and angular_keywords are kept
    but currently unused. The unwanted_* collections filter the crawl.

    NOTE(review): entries shown as '[Link]' were redacted in this copy of
    the source; restore the real values from the original. String literals
    broken across lines by the extraction have been re-joined.
    """

    def __init__(self):
        # Keywords indicating a login/sign-in context (multi-language).
        self.login_keywords = [
            'login', 'signin', 'giriş', 'oturum aç', 'kullanıcı girişi', 'login-2',
            'log in', 'sign in', 'entrance', 'access', 'member', 'authenticate',
            'logon', 'connect', 'connexion', 'account', 'user', 'signon',
            'iniciar sesión', '登录', '注册', 'вход', 'войти', 'accedi', 'anmelden',
            'se connecter', 'entrar', 'ログイン', 'サインイン', 'hesap', 'kaydol',
            'üye ol,',
            ' Müşteri No / T.C. Kimlik No', 'Müşteri No,', 'T.C. Kimlik No'
        ]
        # Keywords indicating a username/email input field.
        self.username_keywords = [
            'user', 'username', 'email', 'e-mail', 'kullanıcı', 'e-posta',
            'e-posta adresi', 'mail',
            'correo', 'usuario', 'ユーザー', 'メールアドレス', '用户名', '电子邮件', '账户',
            'utilisateur', 'benutzername',
            'customerOrIdentityNumber', 'üye ol', 'Müşteri No / T.C. Kimlik No',
            'Müşteri No',
            'T.C. Kimlik No', 'customerNumber', 'userNumber', 'müşteri no',
            'kullanıcı no', 'userNameInput', 'User Account', 'someone@[Link]',
            'kunyeYayinlayan'
        ]
        # Keywords indicating a password input field.
        self.password_keywords = [
            'password', 'pass', 'şifre', 'parola', 'contraseña', 'mot de passe',
            'passwort', '密码', 'パスワード', 'пароль', 'senha', 'sifre',
            'şifrə', 'passwordInput'
        ]
        # Keywords indicating a submit/login button.
        self.submit_keywords = [
            'login', 'signin', 'giriş', 'oturum aç', 'log in', 'sign in',
            'send', 'enter', 'continue', 'logowanie', 'accedi',
            'Yeni Şifre Al / Şifremi Unuttum', 'Şifremi Unuttum', 'Yeni Şifre Al',
        ]
        # Common login URL paths (currently unused by the detectors).
        self.common_login_paths = [
            '/login', '/signin', '/user/login', '/account/login', '/admin/login',
            '/[Link]', '/auth/login', '/[Link]', '/giris', '/oturum-ac',
            'login-2', 'pttweb',
            '/auth/register'
        ]
        # Angular framework markers (currently unused by the detectors).
        self.angular_keywords = [
            'ng-app', 'ng-controller', '__ngContext__', '@angular', 'ng-version',
            '[Link]', 'ngIf', 'ngFor', 'ngModel', 'ngClass', 'ngStyle',
            'data-ng-app', 'data-ng-controller', 'ng-bind', 'ng-repeat'
        ]
        # File extensions that never host login pages (images, docs, ...).
        self.unwanted_extension_keywords = {
            '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg', '.ico', '.pdf',
            '.zip', '.doc', '.docx', '.xls', '.xlsx'
        }
        # Path keywords whose URLs should be skipped.
        self.unwanted_keywords = {
            'video-id', 'photos',
            'kutuphane', 'baslatAnket', 'baslatSanalTur', 'duyuru', 'haber',
            'ekotaban', 'saybis'
        }
        # Hosts that are banned entirely (values redacted in this copy).
        self.unwanted_domains = {
            '[Link]',
        }
        # Known login pages to force-include in the results (development
        # feature, not finished; values redacted in this copy).
        self.present_known_domains = {
            '[Link]?p=2&devam=2f796f7264616d2f3f703d302664696c3d33&dil=0#girisForm',
            '[Link]',
        }

    def get_login_keywords(self) -> List[str]:
        return self.login_keywords

    def get_username_keywords(self) -> List[str]:
        return self.username_keywords

    def get_password_keywords(self) -> List[str]:
        return self.password_keywords

    def get_submit_keywords(self) -> List[str]:
        return self.submit_keywords

    def get_common_login_paths(self) -> List[str]:
        return self.common_login_paths

    def get_angular_keywords(self) -> List[str]:
        return self.angular_keywords

    def get_unwanted_extension_keywords(self) -> List[str]:
        return list(self.unwanted_extension_keywords)

    def get_unwanted_keywords(self) -> List[str]:
        return list(self.unwanted_keywords)

    def get_unwanted_domains(self) -> List[str]:
        return list(self.unwanted_domains)

# [Link]

import argparse
import logging
import os
import time
import json
import signal
import sys
from datetime import datetime
import requests
import tldextract

from collections import defaultdict


from [Link] import WebDriverWait
from [Link] import expected_conditions as EC
from [Link] import By
from [Link] import TimeoutException, WebDriverException
from browser_driver import StandardChromeDriver
from keyword_manager import KeywordManager
from [Link] import urlparse, urlunparse, urljoin
from dotenv import load_dotenv

import threading
from queue import Queue
from [Link] import RotatingFileHandler

# Newly added modules. NOTE(review): `threading` and `time` are imported a
# second time here (already imported above) — harmless but redundant.


import queue
import threading
import time

# Load configuration from .env; override=True lets .env values win over
# variables already present in the process environment.
load_dotenv(override=True)

# Maximum number of times a URL is re-queued after a WebDriver crash.


MAX_RETRIES = 3

# Yardımcı Fonksiyonlar

def is_valid_url(url, keyword_manager):
    """Return True if *url* is worth crawling.

    A URL is rejected when its path ends with an unwanted file extension,
    when its path contains a blacklisted keyword, or when its host is on
    the unwanted-domain list supplied by *keyword_manager*.
    """
    unwanted_extension_keywords = keyword_manager.get_unwanted_extension_keywords()
    unwanted_keywords = keyword_manager.get_unwanted_keywords()
    unwanted_domains = keyword_manager.get_unwanted_domains()

    parsed = urlparse(url)
    path = parsed.path.lower()
    netloc = parsed.netloc.lower()

    logging.debug(f"URL Kontrolü: {url}")
    logging.debug(f"Path: {path}, Netloc: {netloc}")

    # Reject by file extension (images, documents, archives, ...).
    if any(path.endswith(ext) for ext in unwanted_extension_keywords):
        logging.info(f"URL '{url}' is invalid due to unwanted extension in path.")
        return False

    # Reject when the path contains a blacklisted keyword.
    if any(keyword in path for keyword in unwanted_keywords):
        logging.info(f"URL '{url}' is invalid due to unwanted keyword in path.")
        return False

    # Reject known-irrelevant hosts (exact netloc match).
    if any(netloc == domain for domain in unwanted_domains):
        logging.info(f"URL '{url}' is invalid because it's in the unwanted domains list.")
        return False

    logging.debug(f"URL '{url}' geçerli.")
    return True

def normalize_url(url):
    """
    Normalize URL by adding https:// or removing fragment.

    Returns the normalized URL string, or None when the scheme is neither
    http nor https or parsing fails entirely.
    """
    try:
        parsed_url = urlparse(url)
        if not parsed_url.scheme:
            # Scheme-less input: assume HTTPS, as the docstring states.
            # (Original default scheme was redacted in the source — TODO confirm.)
            url = 'https://' + url
            parsed_url = urlparse(url)
        elif parsed_url.scheme not in ('http', 'https'):
            # Any other scheme (ftp:, mailto:, javascript:, ...) is rejected.
            return None

        parsed_url = parsed_url._replace(fragment='')  # drop the "#fragment" part

        normalized_url = urlunparse(parsed_url)
        return normalized_url
    except Exception as e:
        logging.warning(f"URL normalization failed: {url}. Error: {e}")
        return None

def make_array_to_dic(url_listesi):
    """
    Converts a list of URLs into a dictionary grouped by main domains.

    :param url_listesi: List of URLs containing domain and subdomains
    :return: Dictionary grouped by main domains
    """
    domain_sozluk = defaultdict(list)

    for url in url_listesi:
        extracted = tldextract.extract(url)
        # Registered domain = second-level domain + public suffix
        # (e.g. "sub.example.co.uk" -> "example.co.uk").
        ana_domain = f"{extracted.domain}.{extracted.suffix}".lower()
        domain_sozluk[ana_domain].append(url)
    return dict(domain_sozluk)

def detect_login_in_site(driver_instance, site, login_urls, instance_id):
    """
    Run the browser_driver detectors against *site* in order and stop at the
    first hit. The detected URL is appended to *login_urls* (in place) and
    also returned; returns None when no detector finds a login form.
    """
    logging.info(f"Instance {instance_id} checking for login form on site: {site}")

    detectors = [
        ("Login Form Detector", driver_instance.detect_login_form),
        ("Submit Button Detector", driver_instance.detect_submit_button),
        ("Input Field Detector", driver_instance.detect_input_fields_with_login),
        ("Shadow DOM Detector", driver_instance.detect_shadow_dom_with_login)
    ]

    detected_url = None
    for detector_name, detector_func in detectors:
        detected_url = detector_func(site)
        if detected_url:
            logging.info(f"Login form detected by **{detector_name}** on site: {detected_url}")
            login_urls.append(detected_url)
            break
        else:
            try:
                logging.info(f"**{detector_name}** could not find a form on site: {site}")
            except OSError as e:
                # Tolerate logging I/O failures (e.g. rotated/closed log file).
                print(f"Loglama hatası: {e}")

    return detected_url

def process_detected_links(driver_instance, detected_links, instance_id, url_queue,
                           url_list, processed_urls, url_list_lock, processed_urls_lock, keyword_manager):
    """Filter candidate links and enqueue the crawlable ones.

    Each link is normalized, checked against the keyword/domain blacklist and
    against the session's base domain; surviving links that have not been seen
    before are put on *url_queue* and recorded in *url_list*.
    """
    for link in detected_links:
        normalized_link = normalize_url(link)
        if not normalized_link:
            continue

        if not is_valid_url(normalized_link, keyword_manager):
            logging.info(f"Skipping URL with unwanted extension or keyword: {normalized_link}")
            with processed_urls_lock:
                processed_urls.add(normalized_link)
            continue

        # Keep the crawl on the session's base domain.
        if not driver_instance.check_url_matches_base(normalized_link):
            with processed_urls_lock:
                processed_urls.add(normalized_link)
            continue

        with url_list_lock:
            if normalized_link not in url_list and normalized_link not in processed_urls:
                enqueue_url(url_queue, normalized_link)  # retry_count default 0
                url_list.add(normalized_link)
def enqueue_url(url_queue, url, retry_count=0):
    """Put *url* on the queue as a ``(url, retry_count)`` work item."""
    work_item = (url, retry_count)
    url_queue.put(work_item)

def get_initial_sites(env):
    # Seed-site selection was redacted upstream ("##hidden"): this section is
    # expected to define `initial_sites`, a list of seed URLs chosen per *env*
    # — TODO restore before running.
    ##hidden

    # Group the seed URLs by registered domain: {main_domain: [urls, ...]}.
    initial_sites_dic = make_array_to_dic(initial_sites)

    return initial_sites_dic

def write_detected_logins_to_json(domain, detected_logins, detected_json_file, json_lock):
    """Persist the detected login URLs for *domain* to *detected_json_file*.

    The file is (over)written atomically with respect to other threads via
    *json_lock*. Failures are logged, never raised.
    """
    data = {
        "domain": domain,
        "detected_logins": detected_logins,
        "detected_at": datetime.now().isoformat()
    }
    try:
        with json_lock:
            with open(detected_json_file, 'w') as file:
                json.dump(data, file, indent=4)
        logging.info(f"Detected logins for {domain} saved to {detected_json_file}.")
    except Exception as e:
        logging.error(f"Error writing detected logins JSON file {detected_json_file} for {domain}: {e}", exc_info=True)

def write_errors_to_json(domain, error_logins, error_json_file, json_lock):
    """Persist the per-domain crawl errors to *error_json_file*.

    Mirrors write_detected_logins_to_json: serialized under *json_lock*,
    failures are logged rather than raised.
    """
    data = {
        "domain": domain,
        "errors": error_logins,
        "detected_at": datetime.now().isoformat()
    }
    try:
        with json_lock:
            with open(error_json_file, 'w') as file:
                json.dump(data, file, indent=4)
        logging.info(f"Errors for {domain} saved to {error_json_file}.")
    except Exception as e:
        logging.error(f"Error writing errors JSON file {error_json_file} for {domain}: {e}", exc_info=True)

# NOTE(review): stray top-level statement — `post_url` and `session_data` are
# undefined at module scope, so this line raises NameError at import time. It
# appears to be a misplaced fragment of post_data_worker(); confirm and remove.
response = [Link](url=post_url, verify=False, json=session_data)

def post_session_data(env, session_data):
    """Enqueue *session_data* on the global `post_queue` for async posting.

    NOTE(review): `post_url` computed here is never used — post_data_worker
    receives its own URL; kept for interface/behavior compatibility. If *env*
    is not one of the known values, `post_url` is simply left unassigned
    (main() validates *env* before calling).
    """
    if env == "staging":
        post_url = "hidden"
    elif env == "dev":
        post_url = "hidden"
    elif env == "prod":
        post_url = "hidden"

    try:
        post_queue.put(session_data)
        logging.info(f"Session data for {session_data['domain']} enqueued for posting.")
    except Exception as e:
        logging.error(f"Error enqueuing session data for {session_data['domain']}: {e}", exc_info=True)

def check_internet_connection():
    """
    A simple internet connectivity check: True if an HTTP probe succeeds.
    """
    # NOTE(review): the probe URL was redacted in the source; any reliably
    # reachable endpoint works — confirm the original target.
    try:
        requests.get('https://www.google.com', timeout=5)
        return True
    except requests.RequestException:
        # Covers connection errors and timeouts alike.
        return False

def signal_handler(sig, frame, exit_event, threads):
    """SIGINT handler: ask all worker threads to stop, then exit the process.

    Sets *exit_event* (workers poll it), gives each thread up to 1 second to
    finish, and terminates via sys.exit(0).
    """
    logging.info("CTRL + C detected! Closing all threads...")
    exit_event.set()
    # Wait for all threads to finish
    for thread in threads:
        thread.join(timeout=1)
    logging.info("All threads have been signaled to exit.")
    sys.exit(0)

def internet_monitor(exit_event, reconnect_event, url_queue, detected_logins,
                     error_logins, json_lock, error_urls_lock, url_list_lock, processed_urls_lock,
                     keyword_manager, domain):
    """
    Watch internet connectivity; on an outage, signal the main loop (via
    *reconnect_event*) and block until connectivity returns or shutdown.
    """
    while not exit_event.is_set():
        if not check_internet_connection():
            logging.warning("İnternet bağlantısı kesildi.")
            # Tell the main loop to tear down the worker threads.
            reconnect_event.set()
            # Poll every 5 minutes until the connection comes back.
            while not check_internet_connection():
                if exit_event.is_set():
                    break
                logging.info("Bağlantı sağlanamadı, 5 dakika sonra tekrar denenecek.")
                time.sleep(300)  # wait 5 minutes
            if not exit_event.is_set():
                logging.info("İnternet bağlantısı geri geldi. Thread'ler yeniden başlatılıyor.")
                reconnect_event.clear()
        time.sleep(60)  # re-check once a minute

def worker(detected_logins, error_logins, output_lock, instance_id,
           url_queue, url_list, processed_urls, error_urls, exit_event, json_lock,
           error_urls_lock, url_list_lock, processed_urls_lock, keyword_manager,
           base_domain):
    """Crawl worker thread.

    Pulls (url, retry_count) items off *url_queue*, loads each page, runs the
    login-form detectors and enqueues follow-up links. Re-queues a URL up to
    MAX_RETRIES times and restarts its WebDriver after a driver crash; exits
    after being idle for THREAD_TIMEOUT_SECONDS or when *exit_event* is set.
    """
    THREAD_TIMEOUT_SECONDS = 60  # idle time before the thread shuts itself down
    last_active_time = time.time()
    driver_instance = None
    driver = None

    # Start the WebDriver.
    try:
        driver_instance = StandardChromeDriver(
            site=base_domain,  # initialised with the session's base domain
            keywords=[],
            login_pages=[],
            instance_id=instance_id
        )
        driver = driver_instance.get_driver()
        if driver is None:
            logging.error(f"Instance {instance_id}: WebDriver başlatılamadı.")
            return  # cannot work without a driver
        # Timeouts: 120 s for page loads and script execution.
        driver.set_page_load_timeout(120)
        driver.set_script_timeout(120)
    except Exception as e:
        logging.error(f"Instance {instance_id}: WebDriver başlatma hatası: {e}")
        return

    logging.info(f"{base_domain}-Instance-{instance_id} thread'i başladı.")

    while not exit_event.is_set():
        try:
            current_site, retry_count = url_queue.get(timeout=1)  # wait up to 1 s
            last_active_time = time.time()  # got work: refresh the idle timer
        except queue.Empty:
            current_time = time.time()
            if (current_time - last_active_time) > THREAD_TIMEOUT_SECONDS:
                logging.info(f"Instance {instance_id} zaman aşımına uğradı. Thread kapanıyor.")
                break  # idle too long: shut down
            if exit_event.is_set():
                logging.info(f"Instance {instance_id} kapanma sinyali aldı. Kapanıyor.")
                break
            continue  # keep waiting for work

        logging.info(f"Instance {instance_id} işleniyor: {current_site} (Retry: {retry_count})")
        site_start_time = time.time()

        try:
            try:
                driver.get(current_site)
                normalized_site = normalize_url(current_site)
            except TimeoutException as e:
                logging.error(f"Instance {instance_id} siteye gitme zaman aşımına uğradı: {current_site}. Hata: {e}")
                with error_urls_lock:
                    error_logins.append({"url": current_site, "error": "Page load timed out."})
                continue  # next site

            try:
                WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )
                logging.info(f"Instance {instance_id} sayfa yüklendi: {current_site}")
            except TimeoutException as te:
                logging.error(f"Instance {instance_id} sayfa yükleme zaman aşımına uğradı: {current_site}. Hata: {te}")
                with error_urls_lock:
                    error_logins.append({"url": current_site, "error": "Page loading timed out."})
                continue  # next site
            except WebDriverException as we:
                logging.error(f"Instance {instance_id} WebDriver hatası: {current_site}. Hata: {we}")
                # Driver crashed: record the error and re-queue the URL.
                with error_urls_lock:
                    error_logins.append({"url": current_site, "error": str(we)})
                if retry_count < MAX_RETRIES:
                    enqueue_url(url_queue, current_site, retry_count + 1)
                    logging.info(f"Instance {instance_id}: {current_site} tekrar kuyruğa eklendi (Retry: {retry_count + 1}).")
                # Replace the crashed WebDriver with a fresh one.
                try:
                    driver_instance.quit_driver()
                except Exception:
                    pass
                try:
                    driver_instance = StandardChromeDriver(
                        site=base_domain,  # restart with the same base domain
                        keywords=[],
                        login_pages=[],
                        instance_id=instance_id
                    )
                    driver = driver_instance.get_driver()
                    driver.set_page_load_timeout(120)
                    driver.set_script_timeout(120)
                    logging.info(f"Instance {instance_id}: Yeni WebDriver başlatıldı.")
                except Exception as e:
                    logging.error(f"Instance {instance_id}: Yeni WebDriver başlatma hatası: {e}")
                    return  # cannot continue without a driver
                continue  # next site

            # Detect login forms on the loaded page.
            login_urls = []
            detected_url = detect_login_in_site(driver_instance, current_site, login_urls, instance_id)

            if detected_url:
                for url in login_urls:
                    driver_instance.save_login_page(url)
                    with output_lock:
                        detected_logins.append({
                            "url": url,
                            "detected_at": datetime.now().isoformat()
                        })
            else:
                logging.info(f"Instance {instance_id}: Login form bulunamadı: {current_site}")

            # Harvest candidate links from login-related page elements.
            elements = driver_instance.detect_elements_with_login()
            detected_links = [
                urljoin(current_site, element.get_attribute('href') or
                        element.get_attribute('action') or
                        element.get_attribute('onclick'))
                for element in elements
                if (element.get_attribute('href') or
                    element.get_attribute('action') or element.get_attribute('onclick'))
            ]

            if not detected_links:
                logging.info(f"**detect_elements_with_login** ilgili bağlantı bulamadı: {current_site}")
            else:
                logging.info(f"**process_detected_links: {current_site}")
                process_detected_links(driver_instance, detected_links,
                                       instance_id, url_queue, url_list, processed_urls, url_list_lock,
                                       processed_urls_lock, keyword_manager)

            # Flag (and move on from) sites that took too long to process.
            if time.time() - site_start_time > THREAD_TIMEOUT_SECONDS:
                logging.warning(f"Instance {instance_id} site işleme süresi aşıldı: {current_site}")
                continue

        except TimeoutException as te:
            logging.error(f"Instance {instance_id} siteyi işlerken zaman aşımına uğradı: {current_site}. Hata: {te}")
            with error_urls_lock:
                error_logins.append({"url": current_site, "error": "TimeoutException during processing."})
        except WebDriverException as we:
            logging.error(f"Instance {instance_id} WebDriver hatası siteyi işlerken: {current_site}. Hata: {we}")
            # Driver crashed mid-processing: record, re-queue, restart driver.
            with error_urls_lock:
                error_logins.append({"url": current_site, "error": str(we)})
            if retry_count < MAX_RETRIES:
                enqueue_url(url_queue, current_site, retry_count + 1)
                logging.info(f"Instance {instance_id}: {current_site} tekrar kuyruğa eklendi (Retry: {retry_count + 1}).")
            try:
                driver_instance.quit_driver()
            except Exception:
                pass
            try:
                driver_instance = StandardChromeDriver(
                    site=base_domain,  # restart with the same base domain
                    keywords=[],
                    login_pages=[],
                    instance_id=instance_id
                )
                driver = driver_instance.get_driver()
                driver.set_page_load_timeout(120)
                driver.set_script_timeout(120)
                logging.info(f"Instance {instance_id}: Yeni WebDriver başlatıldı.")
            except Exception as e:
                logging.error(f"Instance {instance_id}: Yeni WebDriver başlatma hatası: {e}")
                return  # cannot continue without a driver
        except Exception as e:
            logging.error(f"Instance {instance_id} siteyi işlerken hata oluştu: {current_site}. Hata: {e}")
            with error_urls_lock:
                error_logins.append({"url": current_site, "error": str(e)})
        finally:
            url_queue.task_done()

    if driver_instance:
        driver_instance.quit_driver()

def post_data_worker(post_queue, post_stop_event, post_url):
    """
    Drain *post_queue* and POST session data in batches of up to 100 items,
    sleeping 5 minutes between batches. Runs until *post_stop_event* is set.
    """
    batch_size = 100
    while not post_stop_event.is_set():
        batch = []
        try:
            # Pull up to batch_size items off the queue (1 s timeout each).
            while len(batch) < batch_size:
                item = post_queue.get(timeout=1)
                batch.append(item)
        except queue.Empty:
            pass  # queue drained: post whatever was collected

        if batch:
            for session_data in batch:
                try:
                    response = requests.post(url=post_url, verify=False, json=session_data)
                    if response.status_code == 200:
                        logging.info(f"Successfully posted session data for {session_data['domain']} to {post_url}")
                    elif response.status_code == 400:
                        logging.error(f"Invalid JSON format in the request body for {session_data['domain']}: {response.status_code}")
                    elif response.status_code == 500:
                        logging.error(f"Internal server error on test for {session_data['domain']}: {response.status_code}")
                    else:
                        logging.error(f"Something went wrong on test for {session_data['domain']}: {response.status_code}")
                except Exception as e:
                    logging.error(f"Error posting session data for {session_data['domain']}: {e}", exc_info=True)

            # After a batch, wait 5 minutes (interruptible second by second).
            logging.info("Batch of 100 session data posted. Waiting for 5 minutes before next batch.")
            for _ in range(300):  # 300 s = 5 minutes
                if post_stop_event.is_set():
                    break
                time.sleep(1)

        else:
            # Nothing queued: idle briefly before polling again.
            time.sleep(5)

def start_threads(detected_logins, error_logins, output_lock, num_instances,
                  domain,
                  url_queue, url_list, processed_urls, error_urls, exit_event,
                  json_lock, error_urls_lock, url_list_lock, processed_urls_lock,
                  keyword_manager, threads):
    """
    Start *num_instances* worker threads for *domain*, appending each started
    thread to *threads* and logging its launch.
    """
    for i in range(num_instances):
        instance_id = i + 1
        thread_name = f"{domain}-Instance-{instance_id}"
        thread = threading.Thread(target=worker, args=(
            detected_logins, error_logins, output_lock, instance_id,
            url_queue, url_list, processed_urls, error_urls, exit_event,
            json_lock, error_urls_lock, url_list_lock, processed_urls_lock,
            keyword_manager, domain  # domain doubles as the worker's base_domain
        ), name=thread_name)
        thread.start()
        threads.append(thread)
        logging.info(f"{thread_name} başlatıldı.")
        time.sleep(2)  # stagger thread start-up by 2 seconds

def initial_detection(site, instance_id, detected_logins, error_logins,
                      output_lock,
                      url_queue, url_list, processed_urls, error_urls,
                      json_lock, error_urls_lock, url_list_lock,
                      processed_urls_lock, keyword_manager):
    """Bootstrap pass over *site*, run in the main thread.

    Loads the page, runs the login-form detectors and, when none hit, falls
    back to scanning login-related elements to seed *url_queue* with candidate
    links for the worker threads. All failures are recorded in *error_logins*.
    """
    logging.info(f"Başlangıç aşaması: Siteyi kontrol ediyor: {site}")

    try:
        driver_instance = StandardChromeDriver(
            site=site,
            keywords=[],
            login_pages=[],
            instance_id=instance_id
        )

        driver = driver_instance.get_driver()
        if driver is None:
            logging.error(f"WebDriver başlatılamadı {instance_id}: {site}")
            with error_urls_lock:
                error_logins.append({"url": site, "error": "WebDriver could not be started."})
            return

        logging.info(f"Instance {instance_id} successfully get_driver : {site}")

        try:
            logging.info(f"Instance {instance_id} try get : {site}")
            driver.get(site)
            logging.info(f"Instance {instance_id} successfully get : {site}")
            normalized_site = normalize_url(site)
            logging.info(f"Instance {instance_id} successfully normalize {site}")
        except WebDriverException as e:
            logging.error(f"Instance {instance_id} siteye gitme zaman aşımına uğradı: {site}. Hata: {e}", exc_info=True)
            with error_urls_lock:
                error_logins.append({"url": site, "error": str(e)})
            return
        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            logging.info(f"Instance {instance_id} sayfa yüklendi: {site}")
        except TimeoutException as te:
            logging.error(f"Instance {instance_id} sayfa yükleme zaman aşımına uğradı: {site}. Hata: {te}", exc_info=True)
            with error_urls_lock:
                error_logins.append({"url": site, "error": "Page loading timed out."})
            return
        except WebDriverException as we:
            logging.error(f"Instance {instance_id} WebDriver hatası: {site}. Hata: {we}", exc_info=True)
            with error_urls_lock:
                error_logins.append({"url": site, "error": str(we)})
            return

        try:
            with processed_urls_lock:
                processed_urls.add(site)

            login_urls = []
            detected_url = detect_login_in_site(driver_instance, site, login_urls, instance_id)

            if detected_url:
                for url in login_urls:
                    driver_instance.save_login_page(url)
                    with output_lock:
                        detected_logins.append({
                            "url": url,
                            "detected_at": datetime.now().isoformat()
                        })
            else:
                logging.info(f"Başlangıç aşamasında login formu bulunamadı: {site}")

                # Fallback: scan login-related elements for candidate links.
                # NOTE(review): original indentation was lost; the log message
                # suggests this ran only when the primary detectors failed —
                # confirm against the original source.
                logging.info(f"Primary detectors did not find a login form, starting **detect_elements_with_login**: {site}")
                elements = driver_instance.detect_elements_with_login()
                detected_links = [
                    urljoin(site, element.get_attribute('href') or
                            element.get_attribute('action') or
                            element.get_attribute('onclick'))
                    for element in elements
                    if (element.get_attribute('href') or
                        element.get_attribute('action') or element.get_attribute('onclick'))
                ]
                if not detected_links:
                    logging.info(f"**detect_elements_with_login** ilgili bağlantı bulamadı: {site}")
                else:
                    process_detected_links(driver_instance, detected_links,
                                           instance_id, url_queue, url_list, processed_urls, url_list_lock,
                                           processed_urls_lock, keyword_manager)

        except TimeoutException as te:
            logging.error(f"Instance {instance_id} siteyi işlerken zaman aşımına uğradı: {site}. Hata: {te}", exc_info=True)
            with error_urls_lock:
                error_logins.append({"url": site, "error": "TimeoutException during processing."})
        except WebDriverException as we:
            logging.error(f"Instance {instance_id} WebDriver hatası siteyi işlerken: {site}. Hata: {we}", exc_info=True)
            with error_urls_lock:
                error_logins.append({"url": site, "error": str(we)})
        except Exception as e:
            logging.error(f"Instance {instance_id} siteyi işlerken hata oluştu: {site}. Hata: {e}", exc_info=True)
            with error_urls_lock:
                error_logins.append({"url": site, "error": str(e)})

    except Exception as e:
        logging.error(f"Instance {instance_id} istisna aldı: {e}", exc_info=True)
        with error_urls_lock:
            error_logins.append({"url": site, "error": str(e)})
    finally:
        driver_instance.quit_driver()

def setup_logging(log_directory):
    """Configure the root logger with a size-rotated, date-named log file.

    Creates *log_directory* if needed, removes any previously attached
    handlers, and returns the path of the active log file.
    """
    os.makedirs(log_directory, exist_ok=True)
    log_file = f"login-page-finder.{datetime.now().strftime('%d.%m.%y')}.log"
    log_path = os.path.join(log_directory, log_file)

    logger = logging.getLogger()
    # NOTE(review): the original log level was redacted; INFO chosen — switch
    # to DEBUG if the debug-level URL checks should be captured.
    logger.setLevel(logging.INFO)

    # Drop handlers left over from any previous configuration.
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)

    # Rotate log files: 5 MB per file, 5 backups kept.
    file_handler = RotatingFileHandler(log_path, maxBytes=5*1024*1024, backupCount=5)
    file_handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(threadName)s - %(message)s')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    logging.info(f"LOG_PATH: {log_path}")
    return log_path

def parse_arguments():
    """Parse CLI arguments.

    Accepts an optional positional host and ``--env`` restricted to
    staging/dev/prod (default: dev).
    """
    parser = argparse.ArgumentParser(description="Login Page Finder")
    parser.add_argument(help='host(s) to scan', dest='host', nargs='?')
    parser.add_argument('--env', choices=['staging', 'dev', 'prod'],
                        help='Environment to use', dest='env', default='dev')
    return parser.parse_args()

def main():
    """Entry point: configure logging and crawl every seed domain for login pages.

    For each domain a fresh queue, locks and thread pool are created; results
    are written to per-domain JSON files and enqueued for posting.
    """
    args = parse_arguments()
    env = args.env

    if env not in ["staging", "dev", "prod"]:
        logging.error(f"Invalid environment: {env}")
        logging.error("Options are: ['staging','dev','prod']")
        exit()

    num_instances = int(os.getenv("NUM_INSTANCES", "1"))
    if num_instances <= 0:
        print(f"NUM_INSTANCES değeri geçersiz: {num_instances}. 1 veya daha fazla olmalı.")
        logging.error(f"NUM_INSTANCES değeri geçersiz: {num_instances}. 1 veya daha fazla olmalı.")
        exit()

    log_directory = "log/"
    detected_logs_directory = "detectedlogs/"
    error_logs_directory = "errorlogs/"
    setup_logging(log_directory)
    os.makedirs(detected_logs_directory, exist_ok=True)
    os.makedirs(error_logs_directory, exist_ok=True)

    logging.info(f"NUM_INSTANCES: {num_instances}")
    THREAD_TIMEOUT_SECONDS = int(os.getenv("THREAD_TIMEOUT_SECONDS", "60"))  # seconds
    logging.info(f"THREAD_TIMEOUT_SECONDS: {THREAD_TIMEOUT_SECONDS}")

    initial_sites_dict = get_initial_sites(env)

    if not initial_sites_dict:
        logging.info("İşlenecek site bulunamadı.")
        return

    keyword_manager = KeywordManager()

    # Known login URLs per domain, merged into results after crawling.
    # NOTE(review): the key and URLs below were redacted/garbled in the source
    # ("[Link]"); only the recoverable query/fragment tails are kept — restore
    # the full values before production use.
    present_known_domains = {
        '[Link]': {
            '[Link]p=2&devam=2f796f7264616d2f3f703d302664696c3d33&dil=0#girisForm',
            '[Link]',
            '[Link]%C4%B1OturumuA%C3%A7ma',
        },
        # Additional domains can be added here.
    }

    # Posting queue and its background worker thread.
    global post_queue  # post_session_data() enqueues onto this
    post_queue = Queue()
    post_stop_event = threading.Event()

    # Posting endpoint from the environment.
    # NOTE(review): the hard-coded default was redacted in the source.
    post_url = os.getenv("POST_URL", "hidden")

    post_thread = threading.Thread(target=post_data_worker, args=(post_queue, post_stop_event, post_url), daemon=True)
    post_thread.start()
    logging.info("Post data worker thread started.")

    # Event used by the connectivity monitor to request a worker restart.
    reconnect_event = threading.Event()

    for domain, subdomains in initial_sites_dict.items():
        logging.info(f"Domain için oturum başlatılıyor: {domain} ile subdomain'ler: {subdomains}")

        # Fresh per-domain queue, bookkeeping sets, result lists and locks.
        url_queue = Queue()
        processed_urls = set()
        url_list = set()
        error_urls = []
        detected_logins_session = []  # per-session detected logins
        error_logins_session = []     # per-session errors

        json_lock = threading.Lock()
        error_urls_lock = threading.Lock()
        processed_urls_lock = threading.Lock()
        url_list_lock = threading.Lock()
        output_lock = threading.Lock()

        exit_event = threading.Event()
        threads = []

        # CTRL+C tears down this session's threads.
        signal.signal(signal.SIGINT, lambda sig, frame: signal_handler(sig, frame, exit_event, threads))

        # Seed the queue with the domain itself...
        normalized_domain = normalize_url(domain)
        if normalized_domain and is_valid_url(normalized_domain, keyword_manager):
            with url_list_lock:
                if normalized_domain not in url_list and normalized_domain not in processed_urls:
                    enqueue_url(url_queue, normalized_domain)  # retry_count default 0
                    url_list.add(normalized_domain)
                    processed_urls.add(normalized_domain)
        else:
            logging.warning(f"Skipping initial domain due to invalid URL or unwanted extension: {domain}")

        # ...and each of its subdomains.
        for subdomain in subdomains:
            normalized_subdomain = normalize_url(subdomain)
            if normalized_subdomain and is_valid_url(normalized_subdomain, keyword_manager):
                with url_list_lock:
                    if normalized_subdomain not in url_list and normalized_subdomain not in processed_urls:
                        enqueue_url(url_queue, normalized_subdomain)  # retry_count default 0
                        url_list.add(normalized_subdomain)
                        processed_urls.add(normalized_subdomain)
            else:
                logging.warning(f"Skipping subdomain due to invalid URL or unwanted extension: {subdomain}")

        # Initial detection runs in the main thread.
        initial_detection(normalized_domain, 0, detected_logins_session, error_logins_session, output_lock,
                          url_queue, url_list, processed_urls, error_urls,
                          json_lock, error_urls_lock, url_list_lock,
                          processed_urls_lock, keyword_manager)

        # Start the worker threads.
        start_threads(detected_logins_session, error_logins_session, output_lock, num_instances, domain,
                      url_queue, url_list, processed_urls, error_urls, exit_event,
                      json_lock, error_urls_lock, url_list_lock,
                      processed_urls_lock, keyword_manager, threads)

        # Connectivity monitor for this session.
        monitor_thread = threading.Thread(target=internet_monitor, args=(
            exit_event, reconnect_event, url_queue, detected_logins_session, error_logins_session,
            json_lock, error_urls_lock, url_list_lock, processed_urls_lock,
            keyword_manager, domain
        ), name=f"{domain}-InternetMonitor")
        monitor_thread.start()

        try:
            # NOTE(review): this loop only exits when exit_event is set (signal
            # handler) — workers' idle-timeout exit does not set it; confirm
            # the intended session-completion condition.
            while not exit_event.is_set():
                if reconnect_event.is_set():
                    # Connectivity lost: wind down workers...
                    logging.warning("İnternet bağlantısı kesildi, thread'ler kapatılıyor.")
                    for thread in threads:
                        thread.join(timeout=1)
                    threads.clear()

                    # ...and restart them once the connection is back.
                    if check_internet_connection():
                        logging.info("İnternet bağlantısı geri geldi, thread'ler yeniden başlatılıyor.")
                        start_threads(detected_logins_session, error_logins_session, output_lock, num_instances, domain,
                                      url_queue, url_list, processed_urls, error_urls, exit_event,
                                      json_lock, error_urls_lock, url_list_lock,
                                      processed_urls_lock, keyword_manager, threads)
                        reconnect_event.clear()
                time.sleep(1)
        except KeyboardInterrupt:
            logging.info("Ana süreçte KeyboardInterrupt alındı. Shutdown başlatılıyor...")
            signal_handler(None, None, exit_event, threads)

        # Wait for the worker threads and the monitor to finish.
        for thread in threads:
            thread.join()

        monitor_thread.join()

        # Merge the known login URLs into the results.
        known_logins = present_known_domains.get(domain, set())
        if known_logins:
            with output_lock:
                for login_url in known_logins:
                    detected_logins_session.append({
                        "url": login_url,
                        "detected_at": datetime.now().isoformat(),
                        "source": "present_known_domains"
                    })
            logging.info(f"Added known login URLs for {domain} to detected logins.")

        # Write the detected-logins JSON file and enqueue it for posting.
        if detected_logins_session:
            detected_json_file = os.path.join(detected_logs_directory,
                                              f"{domain.replace('.', '_')}_detected_logins.json")
            write_detected_logins_to_json(domain, detected_logins_session, detected_json_file, json_lock)

            # Prepare the session data.
            session_data = {
                "domain": domain,
                "detected_logins": detected_logins_session
            }
            # Post the session data by enqueueing it.
            post_session_data(env, session_data)

        # Write the error-log JSON file.
        if error_logins_session:
            error_json_file = os.path.join(error_logs_directory,
                                           f"{domain.replace('.', '_')}_errors.json")
            write_errors_to_json(domain, error_logins_session, error_json_file, json_lock)

        # Clear the per-session state.
        with error_urls_lock:
            error_logins_session.clear()

        with url_list_lock:
            url_list.clear()
            logging.info(f"url_list temizlendi: {domain} için.")

        with processed_urls_lock:
            processed_urls.clear()
            logging.info(f"processed_urls temizlendi: {domain} için.")

        threads.clear()
        logging.info(f"Thread listesi temizlendi: {domain} için.")
        logging.info(f"Tüm siteler işlendi ve {domain} için JSON dosyalarına yazıldı.")

    # All domains processed: stop the posting worker.
    post_stop_event.set()
    post_thread.join()
    logging.info("Post data worker thread stopped.")

# Script entry point: run the crawler only when executed directly.
if __name__ == "__main__":
    main()

You might also like