# March 6, 2025
#
# Python script that checks the validity of internet addresses (URLs).

import os
import re
import requests
from urllib.parse import urlparse
import argparse
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def is_valid_url(url, timeout=5):
    """Return True when *url* is both well-formed and reachable (HTTP status < 400)."""
    try:
        parts = urlparse(url)
        # A usable URL needs both a scheme (http/https) and a network host.
        if not (parts.scheme and parts.netloc):
            return False
        # HEAD keeps the probe cheap; follow redirects to the final target.
        reply = requests.head(url, timeout=timeout, allow_redirects=True)
        return reply.status_code < 400
    except Exception:
        # Malformed URL, DNS failure, timeout, refused connection, ...
        return False

def extract_urls(text):
    """Extract HTTP(S) URLs from *text* using a regex.

    Returns the matches in order of appearance (duplicates are kept).
    """
    # BUG FIX: the previous pattern contained '[$-_@.&+]', where '$-_' is an
    # accidental character *range*, and r'\\(' / r'\\)' put literal
    # backslashes into the class.  Match the scheme followed by a run of
    # characters that are legal in URLs (RFC 3986 unreserved + reserved + %).
    url_pattern = r"https?://[\w\-.~:/?#\[\]@!$&'()*+,;=%]+"
    return re.findall(url_pattern, text)

def process_file(file_path, output_dir, timeout):
    """Extract URLs from one text file, validate them, and save the valid ones.

    Parameters:
        file_path: path of the input text file (read as UTF-8).
        output_dir: directory where the "valid_urls_<name>" result is written.
        timeout: per-request timeout in seconds, passed to is_valid_url.

    Returns the number of valid URLs found (0 when none or on any error).
    """
    try:
        filename = os.path.basename(file_path)
        # BUG FIX: the output name and the progress-bar label previously
        # contained the literal text "(unknown)" instead of the file name.
        output_path = os.path.join(output_dir, f"valid_urls_{filename}")

        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        urls = extract_urls(content)
        if not urls:
            print(f"No URLs found in {file_path}")
            return 0

        # Validation is network-bound, so threads overlap the waiting;
        # executor.map preserves input order, letting zip() pair results back.
        with ThreadPoolExecutor(max_workers=10) as executor:
            results = list(tqdm(
                executor.map(lambda url: is_valid_url(url, timeout), urls),
                total=len(urls),
                desc=f"Validating URLs in {filename}"
            ))
        valid_urls = [url for url, ok in zip(urls, results) if ok]

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(valid_urls))

        print(f"Found {len(urls)} URLs in {file_path}, {len(valid_urls)} are valid")
        print(f"Valid URLs saved to {output_path}")
        return len(valid_urls)

    except Exception as e:
        # Best-effort per file: report the failure and let the caller
        # continue with the remaining files.
        print(f"Error processing {file_path}: {e}")
        return 0

def main():
    """Interactive entry point: prompt for a directory, then validate the URLs
    found in each of its .txt files and write the results to an output directory."""
    print("URL Validator Tool")
    print("-----------------")

    # Keep asking until the user supplies an existing directory.
    while True:
        input_dir = input("Enter the path to the directory containing text files with URLs: ")
        if os.path.isdir(input_dir):
            break
        print(f"Error: '{input_dir}' is not a valid directory. Please try again.")

    output_dir = input("Enter the output directory (press Enter for default 'valid_urls'): ").strip()
    if not output_dir:
        output_dir = 'valid_urls'

    timeout_str = input("Enter timeout in seconds for URL validation (press Enter for default 5): ").strip()
    try:
        timeout = int(timeout_str) if timeout_str else 5
    except ValueError:
        print("Invalid timeout value. Using default (5 seconds).")
        timeout = 5

    # BUG FIX: exist_ok=True removes the race between the existence check and
    # the creation (another process could create the directory in between).
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
        print(f"Created output directory: {output_dir}")

    # Only plain .txt files are processed.
    txt_files = [f for f in os.listdir(input_dir) if f.endswith('.txt')]
    if not txt_files:
        print(f"No .txt files found in '{input_dir}'")
        return

    print(f"Found {len(txt_files)} text files to process")

    total_valid = 0
    for filename in txt_files:
        total_valid += process_file(os.path.join(input_dir, filename), output_dir, timeout)

    print("\nSummary:")
    print("---------")
    print(f"Processed {len(txt_files)} text files")
    print(f"Found a total of {total_valid} valid URLs")
    print(f"Results saved in '{output_dir}' directory")


if __name__ == "__main__":
    main()