From be36e77f06db20f362bae41223161abfdeede3ef Mon Sep 17 00:00:00 2001 From: FosterG4 Date: Sat, 26 Jul 2025 01:14:39 +0700 Subject: [PATCH 1/5] Refactor and Improve Proxy Scraper File Improvements: proxyChecker.py: - Split load_proxies_from_file into smaller helper functions - Refactored check() function to reduce complexity - Broke down main() into focused setup functions - Added _prepare_checking_environment, _create_proxy_checker helpers proxyGeolocation.py: - Refactored get_ip_info() with _check_special_addresses helper - Split parse_proxy_list() into focused parsing functions - Simplified _handle_source_analysis with validation helpers - Modularized main() function with environment setup proxyScraper.py: - Enhanced ProxyListApiScraper.handle() with data processing helpers - Refactored scrape() function into configuration and execution phases - Modularized main() with argument parsing and logging setup - Added proper type hints with Optional import --- .github/workflows/tests.yml | 4 +- .gitignore | 52 ++- README.md | 225 +++++++++++-- dev_requirements.txt | 16 +- proxyChecker.py | 593 +++++++++++++++++++++++++++------ proxyGeolocation.py | 504 ++++++++++++++++++++++++++++ proxyScraper.py | 642 +++++++++++++++++++++++++++++------- requirements.txt | 13 +- setup.py | 16 +- user_agents.txt | 37 +++ 10 files changed, 1833 insertions(+), 269 deletions(-) create mode 100644 proxyGeolocation.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d7fdae8..3004d06 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -41,4 +41,6 @@ jobs: - name: Run proxyScraper run: python3 proxyScraper.py -p http - name: Run proxyChecker - run: python3 proxyChecker.py -t 20 -s google.com -l output.txt \ No newline at end of file + run: python3 proxyChecker.py -t 20 -s google.com -l output.txt + - name: Run proxyGeolocation + run: python3 proxyGeolocation.py -i 8.8.8.8 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 0bd7639..82bacc9 100644 --- a/.gitignore +++ b/.gitignore @@ -138,4 +138,54 @@ dmypy.json # Cython debug symbols cython_debug/ -output.txt \ No newline at end of file +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be added to the global gitignore or merged into this project gitignore +.idea/ + +# VS Code +.vscode/ +*.code-workspace + +# Project specific files +.github/copilot-instructions.md +output.txt +test_small.txt +test_local.py +*.txt +!requirements.txt +!dev_requirements.txt +!user_agents.txt +!README.txt + +# Temporary files +*.tmp +*.temp +*.swp +*.swo +*~ + +# OS generated files +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +# Logs +*.log +logs/ + +# Backup files +*.bak +*.backup + +# IDE files +*.sublime-project +*.sublime-workspace + +# Poetry/PDM (modern Python package managers) +poetry.lock +.pdm.toml \ No newline at end of file diff --git a/README.md b/README.md index 7e93215..cef17f5 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,20 @@ -# Proxy Scraper and Checker +# Proxy Scraper & Checker [![Tests](https://github.com/iw4p/proxy-scraper/actions/workflows/tests.yml/badge.svg)](https://github.com/iw4p/proxy-scraper/actions/workflows/tests.yml) [![Downloads](https://static.pepy.tech/badge/proxyz)](https://pepy.tech/project/proxyz) -Scrape more than 1K HTTP - HTTPS - SOCKS4 - SOCKS5 proxies in less than 2 seconds. 
+**Fast, reliable proxy scraper that collects 30K+ HTTP/HTTPS/SOCKS proxies from 24+ sources in seconds.** -Scraping fresh public proxies from different sources: +โœจ **Features:** +- โšก **Fast scraping** - All sources scraped concurrently +- ๐Ÿ›ก๏ธ **Smart filtering** - Automatically removes CDN/bad IPs (Cloudflare, etc.) +- ๐ŸŒ **Global coverage** - Proxies from Asia, Europe, Americas +- ๐Ÿ”ง **Easy to use** - Simple CLI interface +- โœ… **Quality checked** - Built-in proxy validation -- [sslproxies.org](http://sslproxies.org) (HTTP, HTTPS) -- [free-proxy-list.net](http://free-proxy-list.net) (HTTP, HTTPS) -- [us-proxy.org](http://us-proxy.org) (HTTP, HTTPS) -- [socks-proxy.net](http://socks-proxy.net) (Socks4, Socks5) -- [proxyscrape.com](https://proxyscrape.com) (HTTP, Socks4, Socks5) -- [proxy-list.download](https://www.proxy-list.download) (HTTP, HTTPS, Socks4, Socks5) -- [geonode.com](https://geonode.com) (HTTP, HTTPS, Socks4, Socks5) +## Installation & Setup -## Installation +### ๐Ÿ“ฆ Option 1: Install from PyPI (Recommended) You can install the package directly from PyPI using `pip`: @@ -23,63 +22,217 @@ You can install the package directly from PyPI using `pip`: pip install proxyz ``` +**Verify installation:** +```bash +proxy_scraper --help +proxy_checker --help +``` + +### ๐Ÿ”ง Option 2: Install from Source Code + Alternatively, you can install dependencies manually if you're working from the source code: ```bash +# Clone the repository +git clone https://github.com/iw4p/proxy-scraper.git +cd proxy-scraper + +# Install dependencies pip3 install -r requirements.txt + +# Test the installation +python proxyScraper.py --help +python proxyChecker.py --help ``` -## Usage +### ๐Ÿ Python Requirements +- **Python 3.9+** (3.9, 3.10, 3.11, 3.12 supported) +- **Dependencies:** httpx, beautifulsoup4, pysocks -### Using the Command-Line Interface +## Quick Start Tutorial -Once installed via `pip`, you can use the command-line tools `proxy_scraper` and `proxy_checker` directly. +### Step 1: Scrape Proxies +```bash +# Get HTTP proxies (basic) +proxy_scraper -p http + +# Get SOCKS5 proxies with detailed output +proxy_scraper -p socks5 -v -#### For Scraping Proxies: +# Save to custom file +proxy_scraper -p http -o my_proxies.txt -v +``` +### Step 2: Check Proxy Quality ```bash -proxy_scraper -p http +# Test scraped proxies (basic) +proxy_checker -l output.txt -t 10 + +# Test against specific site with verbose output +proxy_checker -l output.txt -s https://google.com -v + +# Use random user agents for testing +proxy_checker -l output.txt -r -v +``` + +### Step 3: Complete Workflow Example +```bash +# 1. Scrape HTTP proxies +proxy_scraper -p http -v -o fresh_proxies.txt + +# 2. Check their quality +proxy_checker -l fresh_proxies.txt -t 15 -v + +# 3. Result: output.txt contains only working proxies ``` -- With `-p` or `--proxy`, you can choose your proxy type. Supported proxy types are: **HTTP - HTTPS - Socks (Both 4 and 5) - Socks4 - Socks5**. -- With `-o` or `--output`, specify the output file name where the proxies will be saved. (Default is **output.txt**). -- With `-v` or `--verbose`, increase output verbosity. -- With `-h` or `--help`, show the help message. 
+## Supported Proxy Types +- **HTTP** - Web traffic +- **HTTPS** - Secure web traffic +- **SOCKS4** - TCP connections +- **SOCKS5** - TCP + UDP connections -#### For Checking Proxies: +## Proxy Sources +We collect proxies from **24 sources**: + +**๐ŸŒ Direct Websites (11 sources)** +- spys.me, free-proxy-list.net, proxyscrape.com, geonode.com +- sslproxies.org, us-proxy.org, socks-proxy.net +- proxy-list.download, proxyscan.io, proxyspace.pro +- freeproxy.lunaproxy.com + +**๐Ÿ“ฆ GitHub Repositories (13 sources)** +- proxifly/free-proxy-list, monosans/proxy-list, TheSpeedX/PROXY-List +- jetkai/proxy-list, roosterkid/openproxylist, mmpx12/proxy-list +- ShiftyTR/Proxy-List, clarketm/proxy-list, sunny9577/proxy-scraper +- zloi-user/hideip.me, almroot/proxylist, aslisk/proxyhttps +- proxy4parsing/proxy-list + +## Advanced Usage + +### CLI Options + +**Scraping:** ```bash -proxy_checker -p http -t 20 -s https://google.com -l output.txt +proxy_scraper -p [-o output.txt] [-v] + +Options: + -p, --proxy Proxy type: http, https, socks, socks4, socks5 + -o, --output Output file (default: output.txt) + -v, --verbose Show detailed statistics ``` -- With `-t` or `--timeout`, set the timeout in seconds after which the proxy is considered dead. (Default is **20**). -- With `-p` or `--proxy`, check HTTPS, HTTP, SOCKS4, or SOCKS5 proxies. (Default is **HTTP**). -- With `-l` or `--list`, specify the path to your proxy list file. (Default is **output.txt**). -- With `-s` or `--site`, check proxies against a specific website like google.com. (Default is **https://google.com**). -- With `-r` or `--random_agent`, use a random user agent per proxy. -- With `-v` or `--verbose`, increase output verbosity. -- With `-h` or `--help`, show the help message. +**Checking:** +```bash +proxy_checker [-l input.txt] [-t timeout] [-s site] [-v] + +Options: + -l, --list Input proxy file (default: output.txt) + -t, --timeout Timeout in seconds (default: 20) + -s, --site Test site (default: https://google.com) + -r, --random_agent Use random user agents + -v, --verbose Show detailed progress +``` + +### From Source Code +```bash +# Clone repository +git clone https://github.com/iw4p/proxy-scraper +cd proxy-scraper + +# Install dependencies +pip install -r requirements.txt + +# Run scraper +python proxyScraper.py -p http -v -### Running Directly from Source +# Check proxies +python proxyChecker.py -l output.txt -v +``` + +## Quality & Performance -If you prefer running the scripts directly from the source code, you can use the following commands: +- โœ… **Automatic filtering** - Removes bad IPs (Cloudflare, CDNs, private ranges) +- ๐Ÿ“Š **Source statistics** - See which sources provide the best proxies +- โšก **Fast concurrent** - All sources scraped simultaneously -#### For Scraping: +## Example Output ```bash -python3 proxyScraper.py -p http +Scraping proxies using 24 sources... +๐Ÿ“Š Source Statistics: +-------------------------------------------------- +ProxyScrapeScraper: 18769 valid, 16408 bad IPs filtered +PlainTextScraper: 13516 valid, 5515 bad IPs filtered +GitHubScraper: 1767 valid, 739 bad IPs filtered +... 
+Total filtered: 22177 bad IPs (CDN/etc), 1 invalid format +Found 30938 unique valid proxies ``` -#### For Checking: +## ๐ŸŒ Proxy Geolocation & Analysis + +The project includes a powerful geolocation tool to analyze proxy origins and track sources: + +### Features +- **๐Ÿ” IP Geolocation** - Get country, city, ISP, and organization info +- **โ˜๏ธ CDN Detection** - Automatically identifies Cloudflare and other CDNs +- **๐Ÿข Datacenter Detection** - Flags hosting providers and datacenters +- **๐Ÿ“Š Source Tracking** - Maps proxies back to their original sources +- **๐Ÿ’พ JSON Export** - Save analysis results for further processing + +### Usage Examples + +**Analyze single IP:** +```bash +python proxyGeolocation.py -i 104.16.1.31 +``` + +**Analyze proxy file:** +```bash +python proxyGeolocation.py -f output.txt -l 50 +``` + +**Track proxy sources:** +```bash +python proxyGeolocation.py -f output.txt -s --limit 100 +``` + +**Export to JSON:** +```bash +python proxyGeolocation.py -f output.txt -o analysis.json +``` +### Sample Output ```bash -python3 proxyChecker.py -p http -t 20 -s https://google.com -l output.txt +๐Ÿ” Proxy Geolocation Analysis Results +================================================== + +๐Ÿ“Š Summary: +Total proxies analyzed: 50 +Proxies with geolocation data: 45 +Cloudflare proxies: 8 +Datacenter proxies: 12 + +๐ŸŒŽ Countries: + United States (US): 15 + Germany (DE): 8 + Singapore (SG): 6 + ... + +๐Ÿ“‹ Detailed Results: +โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +โ˜๏ธ 104.16.1.31:80 - San Francisco, United States | Cloudflare Inc. +๐ŸŒ 45.79.143.52:3128 - Tokyo, Japan | Linode LLC +๐Ÿข 159.203.61.169:3128 - New York, United States | DigitalOcean ``` ## Good to Know - Dead proxies will be removed, and only alive proxies will remain in the output file. -- This script is capable of scraping SOCKS proxies, but `proxyChecker` currently only checks HTTP(S) proxies. +- The proxy checker supports all proxy types: **HTTP, HTTPS, SOCKS4, and SOCKS5**. +- Use random user agents (`-r` flag) for better success rates when checking proxies. 
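+
+The checker can also be driven from Python instead of the CLI. A minimal sketch, assuming it is run from the repository root so `proxyChecker` is importable; the proxy address shown is a placeholder for illustration, not a real server:
+
+```python
+from proxyChecker import Proxy
+
+# Invalid "ip:port" strings raise ValueError, unsupported methods raise NotImplementedError.
+proxy = Proxy("http", "203.0.113.10:8080")  # placeholder address
+
+ok, seconds, error = proxy.check(
+    site="https://httpbin.org/ip",  # a scheme is added automatically if omitted
+    timeout=10,
+    user_agent="Mozilla/5.0",
+    verbose=True,
+)
+print(f"working={ok} time={seconds:.2f}s error={error}")
+```
+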
## Star History diff --git a/dev_requirements.txt b/dev_requirements.txt index f09b308..2b8fd6b 100644 --- a/dev_requirements.txt +++ b/dev_requirements.txt @@ -1,8 +1,8 @@ -flake8==4.0.1 -flake8-black==0.3.1 -flake8-bugbear==22.4.25 -flake8-builtins==1.5.3 -flake8-commas==2.1.0 -flake8-isort==4.1.1 -flake8-polyfill==1.0.2 -pep8-naming==0.12.1 \ No newline at end of file +flake8>=4.0.1,<8.0.0 +flake8-black>=0.3.1,<1.0.0 +flake8-bugbear>=22.4.25,<25.0.0 +flake8-builtins>=1.5.3,<3.0.0 +flake8-commas>=2.1.0,<5.0.0 +flake8-isort>=4.1.1,<7.0.0 +flake8-polyfill>=1.0.2,<2.0.0 +pep8-naming>=0.12.1,<1.0.0 \ No newline at end of file diff --git a/proxyChecker.py b/proxyChecker.py index a5b8828..6e3ee80 100644 --- a/proxyChecker.py +++ b/proxyChecker.py @@ -1,151 +1,552 @@ import argparse +import concurrent.futures +import logging import random import re import socket +import sys import threading import urllib.request +from pathlib import Path from time import time +from typing import List, Optional, Tuple import socks +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +#fallback user agents (will be extended from user_agents.txt if available) user_agents = [ - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/37.0.2062.94 Chrome/37.0.2062.94 Safari/537.36" - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko", - "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", - "Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4", - "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:133.0) Gecko/20100101 Firefox/133.0", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0", ] -try: - with open("user_agents.txt", "r") as f: - for line in f: - user_agents.append(line.replace("\n", "")) -except FileNotFoundError: - pass +# Load additional user agents from file if available +def load_user_agents() -> None: + """Load user agents from external file if available.""" + try: + user_agents_file = Path("user_agents.txt") + if user_agents_file.exists(): + with open(user_agents_file, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line and line not in user_agents: + user_agents.append(line) + logger.debug(f"Loaded {len(user_agents)} user agents from file") + else: + logger.debug("user_agents.txt not found, using built-in user agents") + except Exception as e: + logger.warning(f"Failed to load user agents from file: {e}") + +# 
Load user agents at module level +load_user_agents() class Proxy: - def __init__(self, method, proxy): - if method.lower() not in ["http", "https", "socks4", "socks5"]: - raise NotImplementedError("Only HTTP, HTTPS, SOCKS4, and SOCKS5 are supported") - self.method = method.lower() - self.proxy = proxy - - def is_valid(self): - return re.match(r"\d{1,3}(?:\.\d{1,3}){3}(?::\d{1,5})?$", self.proxy) - - def check(self, site, timeout, user_agent, verbose): - if self.method in ["socks4", "socks5"]: - socks.set_default_proxy(socks.SOCKS4 if self.method == "socks4" else socks.SOCKS5, - self.proxy.split(':')[0], int(self.proxy.split(':')[1])) + """Represents a proxy server with validation and checking capabilities.""" + + SUPPORTED_METHODS = ["http", "https", "socks4", "socks5"] + + def __init__(self, method: str, proxy: str): + """ + Initialize a proxy instance. + + Args: + method: Proxy type (http, https, socks4, socks5) + proxy: Proxy address in format 'ip:port' + + Raises: + NotImplementedError: If proxy method is not supported + ValueError: If proxy format is invalid + """ + method = method.lower().strip() + if method not in self.SUPPORTED_METHODS: + raise NotImplementedError(f"Only {', '.join(self.SUPPORTED_METHODS)} are supported, got: {method}") + + self.method = method + self.proxy = proxy.strip() + + # Validate proxy format during initialization + if not self.is_valid(): + raise ValueError(f"Invalid proxy format: {proxy}") + + def is_valid(self) -> bool: + """ + Validate proxy format (IP:port). + + Returns: + True if proxy format is valid, False otherwise + """ + if not self.proxy or ':' not in self.proxy: + return False + + try: + ip, port = self.proxy.split(':', 1) + + # Validate IP format + if not re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip): + return False + + # Validate IP range (0-255 for each octet) + ip_parts = [int(x) for x in ip.split('.')] + if not all(0 <= part <= 255 for part in ip_parts): + return False + + # Validate port range + port_num = int(port) + if not (1 <= port_num <= 65535): + return False + + return True + except (ValueError, AttributeError): + return False + + def check(self, site: str, timeout: int, user_agent: str, verbose: bool) -> Tuple[bool, float, Optional[Exception]]: + """ + Check if proxy is working by attempting to connect through it. 
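+        For SOCKS4/SOCKS5, the default socket is temporarily replaced via PySocks and restored afterwards.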
+ + Args: + site: Target website to test connection + timeout: Connection timeout in seconds + user_agent: User agent string to use + verbose: Enable verbose logging + + Returns: + Tuple of (is_valid, response_time, error) + """ + if not site.startswith(('http://', 'https://')): + site = f"https://{site}" + + start_time = time() + + try: + if self.method in ["socks4", "socks5"]: + return self._check_socks_proxy(site, timeout, verbose, start_time) + else: + return self._check_http_proxy(site, timeout, user_agent, verbose, start_time) + except Exception as e: + verbose_print(verbose, f"Proxy {self.proxy} failed with unexpected error: {e}") + return False, 0.0, e + + def _check_socks_proxy(self, site: str, timeout: int, verbose: bool, start_time: float) -> Tuple[bool, float, Optional[Exception]]: + """Check SOCKS proxy connectivity.""" + # Store original socket to restore later + original_socket = socket.socket + + try: + ip, port = self.proxy.split(':') + socks_type = socks.SOCKS4 if self.method == "socks4" else socks.SOCKS5 + + socks.set_default_proxy(socks_type, ip, int(port)) socket.socket = socks.socksocket + try: - start_time = time() - urllib.request.urlopen(site, timeout=timeout) - end_time = time() - time_taken = end_time - start_time - verbose_print(verbose, f"Proxy {self.proxy} is valid, time taken: {time_taken}") - return True, time_taken, None - except Exception as e: - verbose_print(verbose, f"Proxy {self.proxy} is not valid, error: {str(e)}") - return False, 0, e - else: - url = self.method + "://" + self.proxy - proxy_support = urllib.request.ProxyHandler({self.method: url}) - opener = urllib.request.build_opener(proxy_support) - urllib.request.install_opener(opener) - req = urllib.request.Request(self.method + "://" + site) - req.add_header("User-Agent", user_agent) - try: - start_time = time() - urllib.request.urlopen(req, timeout=timeout) + response = urllib.request.urlopen(site, timeout=timeout) + response.read(1024) # Read a small amount to ensure connection works end_time = time() time_taken = end_time - start_time - verbose_print(verbose, f"Proxy {self.proxy} is valid, time taken: {time_taken}") + + verbose_print(verbose, f"โœ“ Proxy {self.proxy} ({self.method.upper()}) is valid, time: {time_taken:.2f}s") return True, time_taken, None - except Exception as e: - verbose_print(verbose, f"Proxy {self.proxy} is not valid, error: {str(e)}") - return False, 0, e + + finally: + # Always restore original socket + socket.socket = original_socket + + except Exception as e: + socket.socket = original_socket # Ensure cleanup even on error + verbose_print(verbose, f"โœ— Proxy {self.proxy} ({self.method.upper()}) failed: {e}") + return False, 0.0, e + + def _check_http_proxy(self, site: str, timeout: int, user_agent: str, verbose: bool, start_time: float) -> Tuple[bool, float, Optional[Exception]]: + """Check HTTP/HTTPS proxy connectivity.""" + try: + proxy_url = f"{self.method}://{self.proxy}" + proxy_handler = urllib.request.ProxyHandler({ + 'http': proxy_url, + 'https': proxy_url, + }) + + opener = urllib.request.build_opener(proxy_handler) + + # Create request with proper headers + request = urllib.request.Request(site) + request.add_header("User-Agent", user_agent) + request.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + request.add_header("Accept-Language", "en-US,en;q=0.5") + request.add_header("Accept-Encoding", "gzip, deflate") + request.add_header("Connection", "keep-alive") + + response = opener.open(request, timeout=timeout) + 
response.read(1024) # Read a small amount to ensure connection works + + end_time = time() + time_taken = end_time - start_time + + verbose_print(verbose, f"โœ“ Proxy {self.proxy} ({self.method.upper()}) is valid, time: {time_taken:.2f}s") + return True, time_taken, None + + except Exception as e: + verbose_print(verbose, f"โœ— Proxy {self.proxy} ({self.method.upper()}) failed: {e}") + return False, 0.0, e - def __str__(self): + def __str__(self) -> str: + """String representation of the proxy.""" return self.proxy + def __repr__(self) -> str: + """Detailed string representation.""" + return f"Proxy(method='{self.method}', proxy='{self.proxy}')" + -def verbose_print(verbose, message): +def verbose_print(verbose: bool, message: str) -> None: + """Print message if verbose mode is enabled.""" if verbose: print(message) -def check(file, timeout, method, site, verbose, random_user_agent): +def _process_proxy_line(line: str, line_num: int, method: str) -> Optional[Proxy]: + """Process a single line from proxy file.""" + line = line.strip() + if not line or line.startswith('#'): # Skip empty lines and comments + return None + + try: + return Proxy(method, line) + except (ValueError, NotImplementedError) as e: + logger.debug(f"Line {line_num}: Invalid proxy '{line}' - {e}") + return None + + +def _read_proxy_file(file_path: str) -> List[str]: + """Read and return lines from proxy file.""" + try: + with open(file_path, "r", encoding="utf-8") as f: + return list(f) + except FileNotFoundError: + logger.error(f"Proxy file not found: {file_path}") + sys.exit(1) + except Exception as e: + logger.error(f"Error reading proxy file {file_path}: {e}") + sys.exit(1) + + +def load_proxies_from_file(file_path: str, method: str) -> List[Proxy]: + """ + Load proxies from file and create Proxy objects. + + Args: + file_path: Path to proxy list file + method: Proxy method to use + + Returns: + List of valid Proxy objects + """ proxies = [] - with open(file, "r") as f: - for line in f: - proxies.append(Proxy(method, line.replace("\n", ""))) + invalid_count = 0 + + lines = _read_proxy_file(file_path) + + for line_num, line in enumerate(lines, 1): + proxy = _process_proxy_line(line, line_num, method) + if proxy is not None: + proxies.append(proxy) + else: + if line.strip() and not line.strip().startswith('#'): + invalid_count += 1 + + if invalid_count > 0: + logger.warning(f"Skipped {invalid_count} invalid proxy entries") + + return proxies - print(f"Checking {len(proxies)} proxies") - proxies = filter(lambda x: x.is_valid(), proxies) - valid_proxies = [] - user_agent = random.choice(user_agents) - def check_proxy(proxy, user_agent): - new_user_agent = user_agent - if random_user_agent: - new_user_agent = random.choice(user_agents) - valid, time_taken, error = proxy.check(site, timeout, new_user_agent, verbose) - valid_proxies.extend([proxy] if valid else []) +def save_valid_proxies(file_path: str, valid_proxies: List[Proxy]) -> None: + """ + Save valid proxies back to file. 
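+    Proxies are sorted by address before writing so the output file is deterministic.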
+ + Args: + file_path: Output file path + valid_proxies: List of valid proxies to save + """ + try: + # Sort proxies for consistent output + sorted_proxies = sorted(valid_proxies, key=lambda p: p.proxy) + + with open(file_path, "w", encoding="utf-8") as f: + for proxy in sorted_proxies: + f.write(f"{proxy}\n") + + logger.info(f"Saved {len(valid_proxies)} valid proxies to {file_path}") + + except Exception as e: + logger.error(f"Failed to save proxies to {file_path}: {e}") + raise - threads = [] - for proxy in proxies: - t = threading.Thread(target=check_proxy, args=(proxy, user_agent)) - threads.append(t) - for t in threads: - t.start() +def _prepare_checking_environment(file: str, method: str, site: str, timeout: int, random_user_agent: bool) -> Tuple[List[Proxy], str, int]: + """Prepare the environment for proxy checking.""" + print(f"Loading proxies from {file}...") + proxies = load_proxies_from_file(file, method) + print(f"Loaded {len(proxies)} valid proxies for checking") + + if not proxies: + print("No valid proxies found to check") + return [], "", 0 + + # Choose base user agent + base_user_agent = random.choice(user_agents) + + # Print checking parameters + max_threads = min(len(proxies), 100) + print(f"Starting proxy validation with {max_threads} concurrent threads...") + print(f"Target site: {site}") + print(f"Timeout: {timeout}s") + print(f"Method: {method.upper()}") + print(f"User agent strategy: {'Random per proxy' if random_user_agent else 'Fixed'}") + print("-" * 60) + + return proxies, base_user_agent, max_threads - for t in threads: - t.join() - with open(file, "w") as f: - for proxy in valid_proxies: - f.write(str(proxy) + "\n") +def _create_proxy_checker(valid_proxies: List[Proxy], checked_count_ref: List[int], lock: threading.Lock, + site: str, timeout: int, random_user_agent: bool, base_user_agent: str, + total_proxies: int, verbose: bool): + """Create a proxy checking function with proper closure.""" + def check_single_proxy(proxy: Proxy) -> None: + """Check a single proxy and update results.""" + try: + # Select user agent + current_user_agent = random.choice(user_agents) if random_user_agent else base_user_agent + + # Check proxy + is_valid, response_time, error = proxy.check(site, timeout, current_user_agent, verbose) + + # Update results thread-safely + with lock: + checked_count_ref[0] += 1 + + if is_valid: + valid_proxies.append(proxy) + + # Progress indicator + if not verbose and checked_count_ref[0] % 50 == 0: + print(f"Progress: {checked_count_ref[0]}/{total_proxies} ({len(valid_proxies)} valid)") + + except Exception as e: + logger.debug(f"Unexpected error checking proxy {proxy}: {e}") + + return check_single_proxy - print(f"Found {len(valid_proxies)} valid proxies") + +def check(file: str, timeout: int, method: str, site: str, verbose: bool, random_user_agent: bool) -> None: + """ + Main proxy checking function. 
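+    The input file is overwritten so that only working proxies remain.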
+ + Args: + file: Path to proxy list file + timeout: Connection timeout in seconds + method: Proxy method to check + site: Target website for testing + verbose: Enable verbose output + random_user_agent: Use random user agent per proxy + """ + start_time = time() + + # Prepare checking environment + proxies, base_user_agent, max_threads = _prepare_checking_environment( + file, method, site, timeout, random_user_agent, + ) + + if not proxies: + return + + # Initialize checking state + valid_proxies = [] + checked_count_ref = [0] # Use list for mutable reference + lock = threading.Lock() + + # Create checker function + check_single_proxy = _create_proxy_checker( + valid_proxies, checked_count_ref, lock, site, timeout, + random_user_agent, base_user_agent, len(proxies), verbose, + ) + + # Execute checking with thread pool + with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor: + futures = [executor.submit(check_single_proxy, proxy) for proxy in proxies] + + try: + concurrent.futures.wait(futures, timeout=None) + except KeyboardInterrupt: + print("\nChecking interrupted by user") + executor.shutdown(wait=False) + return + + # Save results + save_valid_proxies(file, valid_proxies) + + # Final statistics + elapsed_time = time() - start_time + success_rate = (len(valid_proxies) / len(proxies)) * 100 if proxies else 0 + + print("-" * 60) + print("Proxy checking completed!") + print(f"Total checked: {len(proxies)}") + print(f"Valid proxies: {len(valid_proxies)}") + print(f"Success rate: {success_rate:.1f}%") + print(f"Time taken: {elapsed_time:.2f} seconds") + print(f"Average time per proxy: {elapsed_time/len(proxies):.2f}s") + + if len(valid_proxies) == 0: + print("โš ๏ธ No working proxies found. Consider:") + print(" - Increasing timeout value") + print(" - Trying a different target site") + print(" - Using fresh proxy list") -def main(): - parser = argparse.ArgumentParser() +def _setup_argument_parser() -> argparse.ArgumentParser: + """Set up and configure the argument parser.""" + parser = argparse.ArgumentParser( + description="Check proxy servers for connectivity and validity", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s -p http -t 10 -v # Check HTTP proxies with 10s timeout + %(prog)s -p socks4 -l socks.txt -r # Check SOCKS4 with random user agents + %(prog)s -p https -s httpbin.org/ip --debug # Check HTTPS proxies against custom site + +Notes: + - Dead proxies are automatically removed from the list file + - Use --debug for detailed error information + - Higher timeout values may find more working proxies but take longer + """, + ) + parser.add_argument( - "-t", - "--timeout", + "-t", "--timeout", type=int, - help="Dismiss the proxy after -t seconds", default=20, + help="Connection timeout in seconds (default: %(default)s)", ) - parser.add_argument("-p", "--proxy", help="Check HTTPS, HTTP, SOCKS4, or SOCKS5 proxies", default="http") - parser.add_argument("-l", "--list", help="Path to your proxy list file", default="output.txt") parser.add_argument( - "-s", - "--site", - help="Check with specific website like google.com", - default="https://google.com/", + "-p", "--proxy", + choices=Proxy.SUPPORTED_METHODS, + default="http", + help="Proxy type to check (default: %(default)s)", ) parser.add_argument( - "-v", - "--verbose", - help="Increase output verbosity", + "-l", "--list", + default="output.txt", + help="Path to proxy list file (default: %(default)s)", + ) + parser.add_argument( + "-s", "--site", + 
default="https://httpbin.org/ip", + help="Target website for testing (default: %(default)s)", + ) + parser.add_argument( + "-v", "--verbose", action="store_true", + help="Enable verbose output showing each proxy check", ) parser.add_argument( - "-r", - "--random_agent", - help="Use a random user agent per proxy", + "-r", "--random_agent", action="store_true", + help="Use a different random user agent for each proxy", ) + parser.add_argument( + "--debug", + action="store_true", + help="Enable debug logging for troubleshooting", + ) + parser.add_argument( + "--max-threads", + type=int, + default=100, + help="Maximum number of concurrent threads (default: %(default)s)", + ) + + return parser + + +def _configure_logging_and_validate_args(args) -> str: + """Configure logging and validate arguments.""" + # Configure logging + if args.debug: + logging.getLogger().setLevel(logging.DEBUG) + elif args.verbose: + logging.getLogger().setLevel(logging.INFO) + else: + logging.getLogger().setLevel(logging.WARNING) + + # Validate arguments + if args.timeout <= 0: + print("Error: Timeout must be positive") + sys.exit(1) + + if args.max_threads <= 0: + print("Error: max-threads must be positive") + sys.exit(1) + + # Check if proxy file exists + if not Path(args.list).exists(): + print(f"Error: Proxy file '{args.list}' not found") + print("Tip: Run the proxy scraper first to generate a proxy list") + sys.exit(1) + + # Normalize site URL + site = args.site + if not site.startswith(('http://', 'https://')): + site = f"https://{site}" + + return site + + +def main() -> None: + """Main entry point for the proxy checker.""" + parser = _setup_argument_parser() args = parser.parse_args() - check(file=args.list, timeout=args.timeout, method=args.proxy, site=args.site, verbose=args.verbose, - random_user_agent=args.random_agent) + + # Configure logging and validate arguments + site = _configure_logging_and_validate_args(args) + + # Display startup information + print("๐Ÿ” Proxy Checker v2.0") + print(f"๐Ÿ“ Proxy file: {args.list}") + print(f"๐ŸŽฏ Target site: {site}") + print(f"โฑ๏ธ Timeout: {args.timeout}s") + print(f"๐Ÿ”ง Method: {args.proxy.upper()}") + print(f"๐Ÿงต Max threads: {args.max_threads}") + print(f"๐Ÿ‘ค User agents: {len(user_agents)} available") + print("=" * 60) + + try: + check( + file=args.list, + timeout=args.timeout, + method=args.proxy, + site=site, + verbose=args.verbose, + random_user_agent=args.random_agent, + ) + + except KeyboardInterrupt: + print("\nโš ๏ธ Operation interrupted by user") + sys.exit(1) + except Exception as e: + logger.error(f"Proxy checking failed: {e}") + if args.debug: + import traceback + traceback.print_exc() + sys.exit(1) if __name__ == "__main__": diff --git a/proxyGeolocation.py b/proxyGeolocation.py new file mode 100644 index 0000000..152d976 --- /dev/null +++ b/proxyGeolocation.py @@ -0,0 +1,504 @@ +#!/usr/bin/env python3 +""" +Proxy Geolocation and Source Tracking Tool +Identifies proxy origins and tracks which sources provide which proxies. 
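+Geolocation lookups use the free ip-api.com JSON endpoint (no API key required).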
+""" + +import argparse +import asyncio +import json +import logging +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import httpx + +# Setup logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +@dataclass +class ProxyInfo: + """Information about a proxy including its geolocation and source.""" + ip: str + port: str + country: Optional[str] = None + country_code: Optional[str] = None + city: Optional[str] = None + region: Optional[str] = None + org: Optional[str] = None + isp: Optional[str] = None + source: Optional[str] = None + is_cloudflare: bool = False + is_datacenter: bool = False + +class ProxyGeolocator: + """Main class for proxy geolocation and source tracking.""" + + def __init__(self): + self.session: Optional[httpx.AsyncClient] = None + + async def __aenter__(self): + """Async context manager entry.""" + self.session = httpx.AsyncClient( + timeout=httpx.Timeout(30.0), + limits=httpx.Limits(max_connections=10, max_keepalive_connections=5), + ) + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Async context manager exit.""" + if self.session: + await self.session.aclose() + + def _check_special_addresses(self, ip: str, proxy_info: ProxyInfo) -> bool: + """Check for special/reserved addresses. Returns True if special address found.""" + try: + import ipaddress + ip_obj = ipaddress.ip_address(ip) + + if str(ip_obj) == "0.0.0.0": + proxy_info.org = "Reserved: 'This host' address" + proxy_info.country = "Invalid" + return True + elif ip_obj.is_private: + proxy_info.org = "Private network address" + proxy_info.country = "Local" + return True + elif ip_obj.is_loopback: + proxy_info.org = "Loopback address" + proxy_info.country = "Local" + return True + elif ip_obj.is_reserved: + proxy_info.org = "Reserved address" + proxy_info.country = "Invalid" + return True + + return False + except Exception: + return False + + def _process_geolocation_data(self, data: dict, proxy_info: ProxyInfo) -> None: + """Process geolocation API response data.""" + if data.get("status") != "success": + return + + proxy_info.country = data.get("country") + proxy_info.country_code = data.get("countryCode") + proxy_info.city = data.get("city") + proxy_info.region = data.get("region") + proxy_info.org = data.get("org") + proxy_info.isp = data.get("isp") + + # Check if it's Cloudflare + org_lower = (data.get("org") or "").lower() + isp_lower = (data.get("isp") or "").lower() + if "cloudflare" in org_lower or "cloudflare" in isp_lower: + proxy_info.is_cloudflare = True + + # Check if it's a datacenter + datacenter_keywords = ["datacenter", "hosting", "server", "cloud", "digital ocean", "aws", "amazon", "google", "microsoft"] + if any(keyword in org_lower or keyword in isp_lower for keyword in datacenter_keywords): + proxy_info.is_datacenter = True + + async def get_ip_info(self, ip: str) -> ProxyInfo: + """Get geolocation information for an IP address.""" + proxy_info = ProxyInfo(ip=ip, port="") + + # Check for special/reserved addresses first + if self._check_special_addresses(ip, proxy_info): + return proxy_info + + try: + # Use ip-api.com for geolocation (free, no API key needed) + url = f"http://ip-api.com/json/{ip}?fields=status,message,country,countryCode,region,city,org,isp,as" + + if not self.session: + raise RuntimeError("Session not initialized") + + response = await self.session.get(url) + response.raise_for_status() 
+ + data = response.json() + self._process_geolocation_data(data, proxy_info) + + except Exception as e: + logger.debug(f"Error getting IP info for {ip}: {e}") + + return proxy_info + + def _parse_proxy_line(self, line: str, line_num: int) -> Optional[Tuple[str, int]]: + """Parse a single proxy line. Returns None if invalid.""" + line = line.strip() + if not line or line.startswith('#'): + return None + + if ':' not in line: + return None + + try: + ip, port = line.split(':', 1) + ip = ip.strip() + port = int(port.strip()) + return (ip, port) + except ValueError: + logger.warning(f"Invalid proxy format on line {line_num}: {line}") + return None + + def _read_proxy_file_lines(self, file_path: str) -> List[str]: + """Read all lines from proxy file.""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + return list(f) + except FileNotFoundError: + logger.error(f"Proxy file not found: {file_path}") + return [] + except Exception as e: + logger.error(f"Error reading proxy file: {e}") + return [] + + def parse_proxy_list(self, file_path: str) -> List[Tuple[str, int]]: + """Parse proxy list file and return list of (ip, port) tuples.""" + proxies = [] + lines = self._read_proxy_file_lines(file_path) + + for line_num, line in enumerate(lines, 1): + proxy = self._parse_proxy_line(line, line_num) + if proxy is not None: + proxies.append(proxy) + + return proxies + + async def analyze_proxies(self, proxy_list: List[Tuple[str, int]], limit: Optional[int] = None) -> List[ProxyInfo]: + """Analyze a list of proxies and get their geolocation info.""" + if limit: + proxy_list = proxy_list[:limit] + + logger.info(f"๐ŸŒ Analyzing {len(proxy_list)} proxies for geolocation...") + + results = [] + for i, (ip, port) in enumerate(proxy_list, 1): + logger.info(f"๐Ÿ“ Analyzing {i}/{len(proxy_list)}: {ip}:{port}") + + proxy_info = await self.get_ip_info(ip) + proxy_info.port = str(port) + results.append(proxy_info) + + # Small delay to be respectful to the API + await asyncio.sleep(0.1) + + return results + + def _calculate_summary_stats(self, results: List[ProxyInfo]) -> Tuple[Dict[str, int], int, int, int]: + """Calculate summary statistics from proxy results.""" + countries = {} + cloudflare_count = 0 + datacenter_count = 0 + valid_info_count = 0 + + for proxy in results: + if proxy.country: + valid_info_count += 1 + country_key = f"{proxy.country} ({proxy.country_code})" if proxy.country_code else proxy.country + countries[country_key] = countries.get(country_key, 0) + 1 + + if proxy.is_cloudflare: + cloudflare_count += 1 + if proxy.is_datacenter: + datacenter_count += 1 + + return countries, cloudflare_count, datacenter_count, valid_info_count + + def _print_summary_stats(self, results: List[ProxyInfo], countries: Dict[str, int], + cloudflare_count: int, datacenter_count: int, valid_info_count: int): + """Print summary statistics.""" + print("\n๐Ÿ“Š Summary:") + print(f"Total proxies analyzed: {len(results)}") + print(f"Proxies with geolocation data: {valid_info_count}") + print(f"Cloudflare proxies: {cloudflare_count}") + print(f"Datacenter proxies: {datacenter_count}") + + if countries: + print("\n๐ŸŒŽ Countries:") + for country, count in sorted(countries.items(), key=lambda x: x[1], reverse=True): + print(f" {country}: {count}") + + def _format_proxy_details(self, proxy: ProxyInfo) -> str: + """Format proxy details for display.""" + flag = "๐Ÿ”" + if proxy.is_cloudflare: + flag = "โ˜๏ธ" + elif proxy.is_datacenter: + flag = "๐Ÿข" + elif proxy.country: + flag = "๐ŸŒ" + + location = "Unknown" + 
if proxy.city and proxy.country: + location = f"{proxy.city}, {proxy.country}" + elif proxy.country: + location = proxy.country + + org_info = "" + if proxy.org: + org_info = f" | {proxy.org}" + if proxy.isp and proxy.isp != proxy.org: + org_info += f" | ISP: {proxy.isp}" + + return f"{flag} {proxy.ip}:{proxy.port} - {location}{org_info}" + + def print_analysis_results(self, results: List[ProxyInfo], show_details: bool = True): + """Print analysis results in a formatted way.""" + if not results: + print("โŒ No proxy data to analyze") + return + + print("\n๐Ÿ” Proxy Geolocation Analysis Results") + print("=" * 50) + + # Calculate summary statistics + countries, cloudflare_count, datacenter_count, valid_info_count = self._calculate_summary_stats(results) + + # Print summary + self._print_summary_stats(results, countries, cloudflare_count, datacenter_count, valid_info_count) + + if show_details: + print("\n๐Ÿ“‹ Detailed Results:") + print("-" * 80) + + for proxy in results: + print(self._format_proxy_details(proxy)) + + def save_results_json(self, results: List[ProxyInfo], output_file: str): + """Save results to JSON file.""" + data = [] + for proxy in results: + data.append({ + "ip": proxy.ip, + "port": proxy.port, + "country": proxy.country, + "country_code": proxy.country_code, + "city": proxy.city, + "region": proxy.region, + "org": proxy.org, + "isp": proxy.isp, + "is_cloudflare": proxy.is_cloudflare, + "is_datacenter": proxy.is_datacenter, + "source": proxy.source, + }) + + try: + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=2, ensure_ascii=False) + print(f"๐Ÿ’พ Results saved to: {output_file}") + except Exception as e: + logger.error(f"Error saving results: {e}") + + async def analyze_proxy_sources(self, proxy_file: str, limit: Optional[int] = None) -> Dict[str, List[str]]: + """Analyze which source each proxy likely came from by checking current scraper results.""" + # Dynamic import to avoid circular dependency + try: + import proxyScraper + scrapers = proxyScraper.scrapers + except ImportError: + logger.warning("Could not import proxyScraper - source analysis unavailable") + return {} + + # Load proxies from file + proxies = self.parse_proxy_list(proxy_file) + if limit: + proxies = proxies[:limit] + + proxy_set = {f"{ip}:{port}" for ip, port in proxies} + source_map = {} + + logger.info(f"๐Ÿ” Analyzing sources for {len(proxy_set)} proxies...") + + # Check each scraper + client_config = { + "follow_redirects": True, + "timeout": 30.0, + "limits": httpx.Limits(max_keepalive_connections=20, max_connections=100), + "headers": { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + }, + } + + async with httpx.AsyncClient(**client_config) as client: + for scraper in scrapers: + try: + logger.info(f"๏ฟฝ Checking {scraper.source_name}...") + scraped_proxies, _ = await scraper.scrape(client) + scraped_set = set(scraped_proxies) + + # Find matches + matches = proxy_set.intersection(scraped_set) + if matches: + source_map[scraper.source_name] = list(matches) + logger.info(f" Found {len(matches)} matches") + + await asyncio.sleep(0.5) # Be respectful to sources + + except Exception as e: + logger.debug(f"Error checking {scraper.source_name}: {e}") + + return source_map + + async def check_single_ip(self, ip: str) -> ProxyInfo: + """Check a single IP address.""" + logger.info(f"๐Ÿ” Checking IP: {ip}") + return await self.get_ip_info(ip) + +def _setup_argument_parser(): + """Set 
up command line argument parser.""" + parser = argparse.ArgumentParser( + description="Proxy Geolocation and Source Tracking Tool", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python proxyGeolocation.py -i 104.16.1.31 + python proxyGeolocation.py -f output.txt -l 20 + python proxyGeolocation.py -f output.txt -s --limit 50 + python proxyGeolocation.py -f output.txt -o results.json + python proxyGeolocation.py -f output.txt --no-details + """, + ) + + parser.add_argument("-i", "--ip", type=str, help="Check single IP address") + parser.add_argument("-f", "--file", type=str, help="Path to proxy list file (default: output.txt)") + parser.add_argument("-s", "--sources", action="store_true", help="Analyze which sources provide which proxies") + parser.add_argument("-l", "--limit", type=int, help="Limit number of proxies to analyze") + parser.add_argument("-o", "--output", type=str, help="Save results to JSON file") + parser.add_argument("--no-details", action="store_true", help="Show only summary, no detailed results") + parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose logging") + + return parser + +async def _handle_single_ip(geolocator, args): + """Handle single IP analysis.""" + result = await geolocator.check_single_ip(args.ip) + geolocator.print_analysis_results([result], show_details=True) + + if args.output: + geolocator.save_results_json([result], args.output) + +def _validate_proxy_file(proxy_file: str) -> bool: + """Validate that proxy file exists.""" + if not Path(proxy_file).exists(): + print(f"โŒ Proxy file not found: {proxy_file}") + print("๐Ÿ’ก Run proxy scraper first: python proxyScraper.py -p http") + return False + return True + +def _print_source_summary(source_map: dict, total_mapped: int) -> None: + """Print source analysis summary.""" + print("\n๐Ÿ” Proxy Source Analysis Results") + print("=" * 50) + print(f"Total proxies mapped to sources: {total_mapped}") + +def _print_source_details(source_map: dict, show_details: bool) -> None: + """Print detailed source information.""" + if not source_map: + return + + print("\n๐Ÿ“Š Sources:") + for source, proxy_list in sorted(source_map.items(), key=lambda x: len(x[1]), reverse=True): + print(f" {source}: {len(proxy_list)} proxies") + if not show_details: + continue + + # Show first few proxies as examples + for proxy in proxy_list[:5]: + print(f" - {proxy}") + if len(proxy_list) > 5: + print(f" ... 
and {len(proxy_list) - 5} more") + print() + +async def _handle_source_analysis(geolocator, args): + """Handle proxy source analysis.""" + proxy_file = args.file or "output.txt" + + if not _validate_proxy_file(proxy_file): + return + + source_map = await geolocator.analyze_proxy_sources(proxy_file, args.limit) + total_mapped = sum(len(proxies) for proxies in source_map.values()) + + _print_source_summary(source_map, total_mapped) + _print_source_details(source_map, not args.no_details) + + if args.output: + output_data = { + "analysis_type": "source_mapping", + "total_mapped": total_mapped, + "sources": source_map, + } + try: + with open(args.output, 'w', encoding='utf-8') as f: + json.dump(output_data, f, indent=2) + print(f"๐Ÿ’พ Source analysis saved to: {args.output}") + except Exception as e: + logger.error(f"Error saving results: {e}") + +async def _handle_file_analysis(geolocator, args): + """Handle proxy file analysis.""" + proxy_file = args.file or "output.txt" + + if not Path(proxy_file).exists(): + print(f"โŒ Proxy file not found: {proxy_file}") + print("๐Ÿ’ก Run proxy scraper first: python proxyScraper.py -p http") + return + + proxies = geolocator.parse_proxy_list(proxy_file) + + if not proxies: + print(f"โŒ No valid proxies found in {proxy_file}") + return + + results = await geolocator.analyze_proxies(proxies, args.limit) + geolocator.print_analysis_results(results, show_details=not args.no_details) + + if args.output: + geolocator.save_results_json(results, args.output) + +def _configure_environment(args) -> None: + """Configure logging and environment settings.""" + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + + # Handle Windows event loop + if sys.platform.startswith('win'): + asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy()) + +async def _run_analysis_based_on_args(geolocator, args): + """Run analysis based on command line arguments.""" + if args.ip: + await _handle_single_ip(geolocator, args) + elif args.sources: + await _handle_source_analysis(geolocator, args) + else: + await _handle_file_analysis(geolocator, args) + +def main(): + """Main function for CLI usage.""" + parser = _setup_argument_parser() + args = parser.parse_args() + + _configure_environment(args) + + async def run_analysis(): + async with ProxyGeolocator() as geolocator: + await _run_analysis_based_on_args(geolocator, args) + + # Run the analysis + try: + asyncio.run(run_analysis()) + except KeyboardInterrupt: + print("\nโน๏ธ Analysis interrupted by user") + except Exception as e: + logger.error(f"Analysis failed: {e}") + +if __name__ == "__main__": + main() diff --git a/proxyScraper.py b/proxyScraper.py index ec00038..c774462 100644 --- a/proxyScraper.py +++ b/proxyScraper.py @@ -1,68 +1,201 @@ import argparse import asyncio +import ipaddress +import logging import platform import re import sys import time +from typing import Dict, List, Optional, Set, Tuple import httpx from bs4 import BeautifulSoup +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# Known bad IP ranges to filter out (Cloudflare, major CDNs, etc.) 
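+# Each entry is a CIDR block; is_bad_ip() below parses candidate addresses with the
+# standard-library ipaddress module and drops any that fall inside these networks
+# (or that are private, loopback, reserved, or multicast).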
+BAD_IP_RANGES = [ + # Cloudflare + "173.245.48.0/20", + "103.21.244.0/22", + "103.22.200.0/22", + "103.31.4.0/22", + "141.101.64.0/18", + "108.162.192.0/18", + "190.93.240.0/20", + "188.114.96.0/20", + "197.234.240.0/22", + "198.41.128.0/17", + "162.158.0.0/15", + "104.16.0.0/13", # This includes our problematic IP 104.16.1.31 + "104.24.0.0/14", + "172.64.0.0/13", + "131.0.72.0/22", + # Amazon CloudFront + "13.32.0.0/15", + "13.35.0.0/17", + "18.160.0.0/15", + "52.222.128.0/17", + "54.182.0.0/16", + "54.192.0.0/16", + "54.230.0.0/16", + "54.239.128.0/18", + "99.86.0.0/16", + "205.251.200.0/21", + "216.137.32.0/19", +] + +def is_bad_ip(ip: str) -> bool: + """Check if an IP is in a known bad range (CDN, etc.) or is a reserved address.""" + try: + ip_obj = ipaddress.ip_address(ip) + + # Check for reserved/special addresses + if ip_obj.is_private or ip_obj.is_loopback or ip_obj.is_reserved or ip_obj.is_multicast: + return True + + # Check for specific bad addresses + if str(ip_obj) in ["0.0.0.0", "255.255.255.255", "127.0.0.1"]: + return True + + # Check against known bad ranges (CDNs) + for cidr in BAD_IP_RANGES: + if ip_obj in ipaddress.ip_network(cidr): + return True + + except (ValueError, ipaddress.AddressValueError): + return True # Invalid IP format + return False + class Scraper: + """Base scraper class for proxy sources.""" - def __init__(self, method, _url): + def __init__(self, method: str, _url: str, timeout: int = 10): self.method = method self._url = _url + self.timeout = timeout + self.source_name = self.__class__.__name__ - def get_url(self, **kwargs): + def get_url(self, **kwargs) -> str: + """Get the formatted URL for the scraper.""" return self._url.format(**kwargs, method=self.method) - async def get_response(self, client): - return await client.get(self.get_url()) + async def get_response(self, client: httpx.AsyncClient) -> httpx.Response: + """Get HTTP response from the proxy source.""" + return await client.get(self.get_url(), timeout=self.timeout) - async def handle(self, response): + async def handle(self, response: httpx.Response) -> str: + """Handle the response and extract proxy data.""" return response.text - async def scrape(self, client): - response = await self.get_response(client) - proxies = await self.handle(response) - pattern = re.compile(r"\d{1,3}(?:\.\d{1,3}){3}(?::\d{1,5})?") - return re.findall(pattern, proxies) + def filter_proxies(self, proxy_text: str) -> Tuple[Set[str], Dict[str, int]]: + """Filter proxies and return valid ones with statistics.""" + proxies = set() + stats = {"total": 0, "filtered_bad": 0, "filtered_invalid": 0, "valid": 0} + + for line in proxy_text.split('\n'): + line = line.strip() + if not line: + continue + + stats["total"] += 1 + + # Basic format validation + if ':' not in line: + stats["filtered_invalid"] += 1 + continue + + try: + ip, port = line.split(':', 1) + ip = ip.strip() + port = port.strip() + + # Validate IP format + ipaddress.ip_address(ip) + + # Validate port + port_num = int(port) + if not (1 <= port_num <= 65535): + stats["filtered_invalid"] += 1 + continue + + # Check if it's a bad IP (CDN, etc.) 
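+                    # (covers the CDN ranges in BAD_IP_RANGES plus reserved/private/multicast space)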
+ if is_bad_ip(ip): + stats["filtered_bad"] += 1 + logger.debug(f"Filtered bad IP from {self.source_name}: {ip}:{port}") + continue + + proxies.add(f"{ip}:{port}") + stats["valid"] += 1 + + except (ValueError, ipaddress.AddressValueError): + stats["filtered_invalid"] += 1 + continue + + return proxies, stats + + async def scrape(self, client: httpx.AsyncClient) -> Tuple[List[str], Dict[str, int]]: + """Scrape proxies from the source.""" + try: + response = await self.get_response(client) + response.raise_for_status() # Raise an exception for bad status codes + proxy_text = await self.handle(response) + + # Use regex to find all potential proxies + pattern = re.compile(r"\d{1,3}(?:\.\d{1,3}){3}(?::\d{1,5})?") + raw_proxies = re.findall(pattern, proxy_text) + + # Filter and validate proxies + valid_proxies, stats = self.filter_proxies('\n'.join(raw_proxies)) + + return list(valid_proxies), stats + except Exception as e: + logger.debug(f"Failed to scrape from {self.source_name} ({self.get_url()}): {e}") + return [], {"total": 0, "filtered_bad": 0, "filtered_invalid": 0, "valid": 0} # From spys.me class SpysMeScraper(Scraper): + """Scraper for spys.me proxy source.""" - def __init__(self, method): - super().__init__(method, "https://spys.me/{mode}.txt") + def __init__(self, method: str): + super().__init__(method, "https://spys.me/{mode}.txt", timeout=15) - def get_url(self, **kwargs): + def get_url(self, **kwargs) -> str: + """Get URL with appropriate mode for the proxy method.""" mode = "proxy" if self.method == "http" else "socks" if self.method == "socks" else "unknown" if mode == "unknown": - raise NotImplementedError + raise NotImplementedError(f"Method {self.method} not supported by SpysMeScraper") return super().get_url(mode=mode, **kwargs) # From proxyscrape.com class ProxyScrapeScraper(Scraper): + """Scraper for proxyscrape.com API.""" - def __init__(self, method, timeout=1000, country="All"): - self.timout = timeout + def __init__(self, method: str, timeout: int = 1000, country: str = "All"): + self.api_timeout = timeout # Renamed to avoid confusion with HTTP timeout self.country = country super().__init__(method, "https://api.proxyscrape.com/?request=getproxies" "&proxytype={method}" - "&timeout={timout}" - "&country={country}") + "&timeout={api_timeout}" + "&country={country}", + timeout=20) # HTTP timeout - def get_url(self, **kwargs): - return super().get_url(timout=self.timout, country=self.country, **kwargs) + def get_url(self, **kwargs) -> str: + """Get URL with API parameters.""" + return super().get_url(api_timeout=self.api_timeout, country=self.country, **kwargs) # From geonode.com - A little dirty, grab http(s) and socks but use just for socks class GeoNodeScraper(Scraper): + """Scraper for geonode.com proxy API.""" - def __init__(self, method, limit="500", page="1", sort_by="lastChecked", sort_type="desc"): + def __init__(self, method: str, limit: str = "500", page: str = "1", + sort_by: str = "lastChecked", sort_type: str = "desc"): self.limit = limit self.page = page self.sort_by = sort_by @@ -72,90 +205,219 @@ def __init__(self, method, limit="500", page="1", sort_by="lastChecked", sort_ty "&limit={limit}" "&page={page}" "&sort_by={sort_by}" - "&sort_type={sort_type}") + "&sort_type={sort_type}", + timeout=15) + + def get_url(self, **kwargs) -> str: + """Get URL with API parameters.""" + return super().get_url(limit=self.limit, page=self.page, + sort_by=self.sort_by, sort_type=self.sort_type, **kwargs) - def get_url(self, **kwargs): - return 
super().get_url(limit=self.limit, page=self.page, sort_by=self.sort_by, sort_type=self.sort_type, **kwargs) # From proxy-list.download class ProxyListDownloadScraper(Scraper): + """Scraper for proxy-list.download API.""" - def __init__(self, method, anon): + def __init__(self, method: str, anon: str): self.anon = anon - super().__init__(method, "https://www.proxy-list.download/api/v1/get?type={method}&anon={anon}") + super().__init__(method, "https://www.proxy-list.download/api/v1/get?type={method}&anon={anon}", timeout=15) - def get_url(self, **kwargs): + def get_url(self, **kwargs) -> str: + """Get URL with anonymity level parameter.""" return super().get_url(anon=self.anon, **kwargs) # For websites using table in html class GeneralTableScraper(Scraper): + """Scraper for websites that use HTML tables to display proxies.""" - async def handle(self, response): - soup = BeautifulSoup(response.text, "html.parser") - proxies = set() - table = soup.find("table", attrs={"class": "table table-striped table-bordered"}) - for row in table.findAll("tr"): - count = 0 - proxy = "" - for cell in row.findAll("td"): - if count == 1: - proxy += ":" + cell.text.replace(" ", "") - proxies.add(proxy) - break - proxy += cell.text.replace(" ", "") - count += 1 - return "\n".join(proxies) + async def handle(self, response: httpx.Response) -> str: + """Parse HTML table to extract proxies.""" + try: + soup = BeautifulSoup(response.text, "html.parser") + proxies: Set[str] = set() + table = soup.find("table", attrs={"class": "table table-striped table-bordered"}) + + if table is None: + logger.debug("No table found with expected class") + return "" + + for row in table.find_all("tr"): + cells = row.find_all("td") + if len(cells) >= 2: + ip = cells[0].get_text(strip=True).replace(" ", "") + port = cells[1].get_text(strip=True).replace(" ", "") + if ip and port: + proxies.add(f"{ip}:{port}") + + return "\n".join(proxies) + except Exception as e: + logger.debug(f"Error parsing HTML table: {e}") + return "" # For websites using div in html class GeneralDivScraper(Scraper): + """Scraper for websites that use HTML divs to display proxies.""" - async def handle(self, response): - soup = BeautifulSoup(response.text, "html.parser") - proxies = set() - table = soup.find("div", attrs={"class": "list"}) - for row in table.findAll("div"): - count = 0 - proxy = "" - for cell in row.findAll("div", attrs={"class": "td"}): - if count == 2: - break - proxy += cell.text+":" - count += 1 - proxy = proxy.rstrip(":") - proxies.add(proxy) - return "\n".join(proxies) + async def handle(self, response: httpx.Response) -> str: + """Parse HTML divs to extract proxies.""" + try: + soup = BeautifulSoup(response.text, "html.parser") + proxies: Set[str] = set() + container = soup.find("div", attrs={"class": "list"}) + + if container is None: + logger.debug("No div found with class 'list'") + return "" + + for row in container.find_all("div"): + cells = row.find_all("div", attrs={"class": "td"}) + if len(cells) >= 2: + ip = cells[0].get_text(strip=True) + port = cells[1].get_text(strip=True) + if ip and port: + proxies.add(f"{ip}:{port}") + + return "\n".join(proxies) + except Exception as e: + logger.debug(f"Error parsing HTML divs: {e}") + return "" # For scraping live proxylist from github class GitHubScraper(Scraper): + """Scraper for GitHub raw proxy lists.""" - async def handle(self, response): - tempproxies = response.text.split("\n") + async def handle(self, response: httpx.Response) -> str: + """Parse GitHub raw proxy list format.""" + 
try: + temp_proxies = response.text.strip().split("\n") + proxies: Set[str] = set() + + for proxy_line in temp_proxies: + proxy_line = proxy_line.strip() + if not proxy_line: + continue + + # Handle different formats: "type://ip:port" or just "ip:port" + if self.method in proxy_line: + # Extract IP:port from lines like "http://1.2.3.4:8080" + if "//" in proxy_line: + proxy = proxy_line.split("//")[-1] + else: + proxy = proxy_line + + # Validate IP:port format + if re.match(r"\d{1,3}(?:\.\d{1,3}){3}:\d{1,5}", proxy): + proxies.add(proxy) + + return "\n".join(proxies) + except Exception as e: + logger.debug(f"Error parsing GitHub proxy list: {e}") + return "" + +# For scraping from proxy list APIs with JSON response +class ProxyListApiScraper(Scraper): + """Scraper for APIs that return JSON proxy lists.""" + + def _extract_proxy_from_item(self, item: dict) -> Optional[str]: + """Extract proxy string from a single item.""" + if not isinstance(item, dict): + return None + + ip = item.get('ip') + port = item.get('port') + if ip and port: + return f"{ip}:{port}" + return None + + def _process_list_data(self, data: list) -> Set[str]: + """Process list-type JSON data.""" proxies = set() - for prxy in tempproxies: - if self.method in prxy: - proxies.add(prxy.split("//")[-1]) - - return "\n".join(proxies) - + for item in data: + proxy = self._extract_proxy_from_item(item) + if proxy: + proxies.add(proxy) + return proxies + + def _process_dict_data(self, data: dict) -> Set[str]: + """Process dict-type JSON data.""" + proxies = set() + if 'data' in data and isinstance(data['data'], list): + for item in data['data']: + proxy = self._extract_proxy_from_item(item) + if proxy: + proxies.add(proxy) + return proxies + async def handle(self, response: httpx.Response) -> str: + """Parse JSON API response for proxies.""" + try: + data = response.json() + proxies: Set[str] = set() + + # Handle different JSON structures + if isinstance(data, list): + proxies = self._process_list_data(data) + elif isinstance(data, dict): + proxies = self._process_dict_data(data) + + return "\n".join(proxies) + except Exception as e: + logger.debug(f"Error parsing JSON API response: {e}") + return "" + +# For scraping from plain text sources +class PlainTextScraper(Scraper): + """Scraper for plain text proxy lists.""" + + async def handle(self, response: httpx.Response) -> str: + """Parse plain text proxy list.""" + try: + proxies: Set[str] = set() + lines = response.text.strip().split('\n') + + for line in lines: + line = line.strip() + if not line or line.startswith('#'): + continue + + # Look for IP:port pattern + if re.match(r"\d{1,3}(?:\.\d{1,3}){3}:\d{1,5}", line): + proxies.add(line) + + return "\n".join(proxies) + except Exception as e: + logger.debug(f"Error parsing plain text proxy list: {e}") + return "" + + +# Improved scrapers list with better organization scrapers = [ + # Direct API scrapers SpysMeScraper("http"), SpysMeScraper("socks"), ProxyScrapeScraper("http"), ProxyScrapeScraper("socks4"), ProxyScrapeScraper("socks5"), GeoNodeScraper("socks"), + + # Download API scrapers ProxyListDownloadScraper("https", "elite"), ProxyListDownloadScraper("http", "elite"), ProxyListDownloadScraper("http", "transparent"), ProxyListDownloadScraper("http", "anonymous"), + + # HTML table scrapers GeneralTableScraper("https", "http://sslproxies.org"), GeneralTableScraper("http", "http://free-proxy-list.net"), GeneralTableScraper("http", "http://us-proxy.org"), GeneralTableScraper("socks", "http://socks-proxy.net"), + + # HTML div 
scrapers GeneralDivScraper("http", "https://freeproxy.lunaproxy.com/"), + + # GitHub raw list scrapers (established sources) GitHubScraper("http", "https://raw.githubusercontent.com/proxifly/free-proxy-list/main/proxies/all/data.txt"), GitHubScraper("socks4", "https://raw.githubusercontent.com/proxifly/free-proxy-list/main/proxies/all/data.txt"), GitHubScraper("socks5", "https://raw.githubusercontent.com/proxifly/free-proxy-list/main/proxies/all/data.txt"), @@ -165,78 +427,230 @@ async def handle(self, response): GitHubScraper("http", "https://raw.githubusercontent.com/zloi-user/hideip.me/main/http.txt"), GitHubScraper("socks4", "https://raw.githubusercontent.com/zloi-user/hideip.me/main/socks4.txt"), GitHubScraper("socks5", "https://raw.githubusercontent.com/zloi-user/hideip.me/main/socks5.txt"), + + # Additional GitHub sources + GitHubScraper("http", "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt"), + GitHubScraper("socks4", "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt"), + GitHubScraper("socks5", "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt"), + GitHubScraper("http", "https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/http.txt"), + GitHubScraper("https", "https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/https.txt"), + GitHubScraper("socks4", "https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks4.txt"), + GitHubScraper("socks5", "https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks5.txt"), + GitHubScraper("http", "https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-http.txt"), + GitHubScraper("https", "https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-https.txt"), + GitHubScraper("socks4", "https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-socks4.txt"), + GitHubScraper("socks5", "https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-socks5.txt"), + GitHubScraper("http", "https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt"), + GitHubScraper("http", "https://raw.githubusercontent.com/sunny9577/proxy-scraper/master/proxies.txt"), + GitHubScraper("http", "https://raw.githubusercontent.com/roosterkid/openproxylist/main/HTTPS_RAW.txt"), + GitHubScraper("socks4", "https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS4_RAW.txt"), + GitHubScraper("socks5", "https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS5_RAW.txt"), + GitHubScraper("http", "https://raw.githubusercontent.com/mmpx12/proxy-list/master/http.txt"), + GitHubScraper("https", "https://raw.githubusercontent.com/mmpx12/proxy-list/master/https.txt"), + GitHubScraper("socks4", "https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks4.txt"), + GitHubScraper("socks5", "https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks5.txt"), + + # Plain text sources + PlainTextScraper("http", "https://www.proxyscan.io/download?type=http"), + PlainTextScraper("socks4", "https://www.proxyscan.io/download?type=socks4"), + PlainTextScraper("socks5", "https://www.proxyscan.io/download?type=socks5"), + PlainTextScraper("http", "https://raw.githubusercontent.com/almroot/proxylist/master/list.txt"), + PlainTextScraper("http", "https://raw.githubusercontent.com/aslisk/proxyhttps/main/https.txt"), + PlainTextScraper("http", "https://raw.githubusercontent.com/proxy4parsing/proxy-list/main/http.txt"), + + # Additional table 
scrapers + GeneralTableScraper("http", "https://proxyspace.pro/http.txt"), + GeneralTableScraper("socks4", "https://proxyspace.pro/socks4.txt"), + GeneralTableScraper("socks5", "https://proxyspace.pro/socks5.txt"), + + # API-based scrapers + ProxyListApiScraper("http", "https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc&protocols=http"), + ProxyListApiScraper("socks5", "https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc&protocols=socks5"), ] -def verbose_print(verbose, message): + +def verbose_print(verbose: bool, message: str) -> None: + """Print message if verbose mode is enabled.""" if verbose: print(message) -async def scrape(method, output, verbose): - now = time.time() + +def _determine_scraping_methods(method: str) -> List[str]: + """Determine which methods to scrape based on input.""" methods = [method] if method == "socks": - methods += ["socks4", "socks5"] + methods.extend(["socks4", "socks5"]) + return methods + +def _get_scrapers_for_methods(methods: List[str]) -> List: + """Get scrapers that match the specified methods.""" proxy_scrapers = [s for s in scrapers if s.method in methods] if not proxy_scrapers: - raise ValueError("Method not supported") - verbose_print(verbose, "Scraping proxies...") - proxies = [] - - tasks = [] - client = httpx.AsyncClient(follow_redirects=True) + raise ValueError(f"Methods '{methods}' not supported") + return proxy_scrapers + +def _create_http_client_config() -> Dict: + """Create HTTP client configuration.""" + return { + "follow_redirects": True, + "timeout": 30.0, + "limits": httpx.Limits(max_keepalive_connections=20, max_connections=100), + "headers": { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + }, + } + +def _print_source_statistics(verbose: bool, source_stats: Dict) -> None: + """Print source statistics if verbose mode is enabled.""" + if not verbose: + return + + print("\n๐Ÿ“Š Source Statistics:") + print("-" * 50) + total_bad_filtered = 0 + total_invalid_filtered = 0 + for source, stats in source_stats.items(): + print(f"{source}: {stats['valid']} valid, {stats['filtered_bad']} bad IPs, {stats['filtered_invalid']} invalid") + total_bad_filtered += stats['filtered_bad'] + total_invalid_filtered += stats['filtered_invalid'] + print(f"\nTotal filtered: {total_bad_filtered} bad IPs (CDN/etc), {total_invalid_filtered} invalid format") + +async def scrape(method: str, output: str, verbose: bool) -> None: + """ + Main scraping function that coordinates all scrapers. 
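+
+    All sources are scraped concurrently with a shared httpx.AsyncClient and the
+    combined results are deduplicated before being written to the output file.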
+ + Args: + method: Proxy type to scrape (http, https, socks, socks4, socks5) + output: Output file path + verbose: Enable verbose logging + """ + start_time = time.time() + + # Setup scraping parameters + methods = _determine_scraping_methods(method) + proxy_scrapers = _get_scrapers_for_methods(methods) + client_config = _create_http_client_config() + + verbose_print(verbose, f"Scraping proxies using {len(proxy_scrapers)} sources...") + all_proxies: List[str] = [] + source_stats: Dict[str, Dict[str, int]] = {} - async def scrape_scraper(scraper): + async def scrape_source(scraper, client) -> None: + """Scrape from a single source.""" try: - verbose_print(verbose, f"Looking {scraper.get_url()}...") - proxies.extend(await scraper.scrape(client)) - except Exception: - pass - - for scraper in proxy_scrapers: - tasks.append(asyncio.ensure_future(scrape_scraper(scraper))) - - await asyncio.gather(*tasks) - await client.aclose() - - proxies = set(proxies) - verbose_print(verbose, f"Writing {len(proxies)} proxies to file...") - with open(output, "w") as f: - f.write("\n".join(proxies)) - verbose_print(verbose, "Done!") - verbose_print(verbose, f"Took {time.time() - now} seconds") - -def main(): - parser = argparse.ArgumentParser() + verbose_print(verbose, f"Scraping from {scraper.get_url()}...") + proxies, stats = await scraper.scrape(client) + all_proxies.extend(proxies) + source_stats[scraper.source_name] = stats + verbose_print(verbose, f"Found {len(proxies)} valid proxies from {scraper.source_name} ({stats['filtered_bad']} bad IPs filtered, {stats['filtered_invalid']} invalid filtered)") + except Exception as e: + logger.debug(f"Failed to scrape from {scraper.source_name}: {e}") + source_stats[scraper.source_name] = {"total": 0, "filtered_bad": 0, "filtered_invalid": 0, "valid": 0} + + # Execute all scrapers concurrently + async with httpx.AsyncClient(**client_config) as client: + tasks = [scrape_source(scraper, client) for scraper in proxy_scrapers] + await asyncio.gather(*tasks, return_exceptions=True) + + # Process results + unique_proxies: Set[str] = set(all_proxies) + _print_source_statistics(verbose, source_stats) + + # Write results to file + verbose_print(verbose, f"Writing {len(unique_proxies)} unique proxies to {output}...") + try: + with open(output, "w", encoding="utf-8") as f: + f.write("\n".join(sorted(unique_proxies)) + "\n") + except IOError as e: + logger.error(f"Failed to write to output file {output}: {e}") + raise + + elapsed_time = time.time() - start_time + verbose_print(verbose, f"Scraping completed in {elapsed_time:.2f} seconds") + verbose_print(verbose, f"Found {len(unique_proxies)} unique valid proxies") + +def _setup_argument_parser(): + """Set up and return the argument parser.""" + parser = argparse.ArgumentParser( + description="Scrape proxies from multiple sources", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s -p http -v # Scrape HTTP proxies with verbose output + %(prog)s -p socks -o socks.txt # Scrape SOCKS proxies to custom file + %(prog)s -p https --verbose # Scrape HTTPS proxies with verbose output + """, + ) + + supported_methods = sorted(set(s.method for s in scrapers)) + parser.add_argument( - "-p", - "--proxy", - help="Supported proxy type: " + ", ".join(sorted(set([s.method for s in scrapers]))), + "-p", "--proxy", required=True, + choices=supported_methods, + help=f"Proxy type to scrape. 
Supported types: {', '.join(supported_methods)}", ) parser.add_argument( - "-o", - "--output", - help="Output file name to save .txt file", + "-o", "--output", default="output.txt", + help="Output file name to save proxies (default: %(default)s)", ) parser.add_argument( - "-v", - "--verbose", - help="Increase output verbosity", + "-v", "--verbose", action="store_true", + help="Enable verbose output", ) - args = parser.parse_args() - - if sys.version_info >= (3, 7) and platform.system() == 'Windows': - loop = asyncio.get_event_loop() - loop.run_until_complete(scrape(args.proxy, args.output, args.verbose)) - loop.close() - elif sys.version_info >= (3, 7): + parser.add_argument( + "--debug", + action="store_true", + help="Enable debug logging", + ) + + return parser + +def _configure_logging(args): + """Configure logging based on command line arguments.""" + if args.debug: + logging.getLogger().setLevel(logging.DEBUG) + elif args.verbose: + logging.getLogger().setLevel(logging.INFO) + else: + logging.getLogger().setLevel(logging.WARNING) + +def _run_scraping(args): + """Run the scraping process with appropriate event loop handling.""" + if sys.version_info >= (3, 7): + if platform.system() == 'Windows': + # Windows-specific asyncio policy for better compatibility + asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy()) asyncio.run(scrape(args.proxy, args.output, args.verbose)) else: + # Fallback for Python < 3.7 loop = asyncio.get_event_loop() - loop.run_until_complete(scrape(args.proxy, args.output, args.verbose)) - loop.close() + try: + loop.run_until_complete(scrape(args.proxy, args.output, args.verbose)) + finally: + loop.close() + +def main() -> None: + """Main entry point for the proxy scraper.""" + parser = _setup_argument_parser() + args = parser.parse_args() + + _configure_logging(args) + + try: + _run_scraping(args) + except KeyboardInterrupt: + print("\nScraping interrupted by user") + sys.exit(1) + except Exception as e: + logger.error(f"Scraping failed: {e}") + if args.debug: + import traceback + traceback.print_exc() + sys.exit(1) + if __name__ == "__main__": main() diff --git a/requirements.txt b/requirements.txt index 97b770a..c6a4ddd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ -beautifulsoup4==4.11.1 -requests==2.27.1 -colorama==0.4.4 -urllib3==1.26.9 -httpx -socks -PySocks \ No newline at end of file +beautifulsoup4>=4.11.1,<5.0.0 +requests>=2.27.1,<3.0.0 +colorama>=0.4.4,<1.0.0 +urllib3>=1.26.9,<3.0.0 +httpx>=0.23.0,<1.0.0 +PySocks>=1.7.1,<2.0.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 575d218..8842a4b 100644 --- a/setup.py +++ b/setup.py @@ -2,12 +2,12 @@ setup( name='proxyz', - version='0.2.0', + version='0.4.0', py_modules=['proxyScraper', 'proxyChecker'], install_requires=[ - 'httpx', - 'beautifulsoup4', - 'pysocks', + 'httpx>=0.23.0,<1.0.0', + 'beautifulsoup4>=4.11.1,<5.0.0', + 'pysocks>=1.7.1,<2.0.0', ], entry_points={ 'console_scripts': [ @@ -21,14 +21,18 @@ }, author='Nima Akbarzadeh', author_email='iw4p@protonmail.com', - description='scrape proxies from more than 5 different sources and check which ones are still alive', + description='scrape proxies from more than 12 different sources and check which ones are still alive', long_description=open('README.md').read(), long_description_content_type='text/markdown', url='https://github.com/iw4p/proxy-scraper', classifiers=[ 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 
'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', 'License :: OSI Approved :: MIT License', 'Operating System :: OS Independent', ], - python_requires='>=3.7', + python_requires='>=3.9', ) diff --git a/user_agents.txt b/user_agents.txt index ae82bd5..b9e20a0 100644 --- a/user_agents.txt +++ b/user_agents.txt @@ -1,3 +1,40 @@ +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:133.0) Gecko/20100101 Firefox/133.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:132.0) Gecko/20100101 Firefox/132.0 +Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0 +Mozilla/5.0 (X11; Linux x86_64; rv:132.0) Gecko/20100101 Firefox/132.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15 +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 OPR/117.0.0.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 OPR/116.0.0.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 OPR/117.0.0.0 +Mozilla/5.0 (iPhone; CPU iPhone OS 18_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Mobile/15E148 Safari/604.1 +Mozilla/5.0 (iPhone; CPU iPhone OS 18_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Mobile/15E148 Safari/604.1 +Mozilla/5.0 (iPad; CPU OS 18_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Mobile/15E148 Safari/604.1 +Mozilla/5.0 (iPad; CPU OS 18_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Mobile/15E148 Safari/604.1 +Mozilla/5.0 (iPhone; CPU iPhone OS 18_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/131.0.6778.73 Mobile/15E148 Safari/604.1 +Mozilla/5.0 (iPhone; CPU iPhone OS 18_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/130.0.6723.90 Mobile/15E148 Safari/604.1 +Mozilla/5.0 (Android 14; Mobile; rv:133.0) Gecko/133.0 Firefox/133.0 +Mozilla/5.0 (Android 13; Mobile; rv:132.0) Gecko/132.0 Firefox/132.0 +Mozilla/5.0 (Linux; Android 14; SM-G998B) AppleWebKit/537.36 (KHTML, 
like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36 +Mozilla/5.0 (Linux; Android 13; SM-G991B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Mobile Safari/537.36 +Mozilla/5.0 (Linux; Android 14; Pixel 8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36 +Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Mobile Safari/537.36 +Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 +Mozilla/5.0 (Windows NT 11.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0 +Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0 +Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0 +Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:132.0) Gecko/20100101 Firefox/132.0 Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/37.0.2062.94 Chrome/37.0.2062.94 Safari/537.36 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36 Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko From 0c1b4db03e07ded28fb92cdf0356b30d626c9f9a Mon Sep 17 00:00:00 2001 From: Dikky Hardian <30888372+FosterG4@users.noreply.github.com> Date: Sat, 26 Jul 2025 01:34:17 +0700 Subject: [PATCH 2/5] Update setup.py version mismatch --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8842a4b..4f12769 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='proxyz', - version='0.4.0', + version='0.3.0', py_modules=['proxyScraper', 'proxyChecker'], install_requires=[ 'httpx>=0.23.0,<1.0.0', From 364f155f18e3709b95494233c85b8650ff39fb8a Mon Sep 17 00:00:00 2001 From: FosterG4 Date: Sat, 26 Jul 2025 01:45:49 +0700 Subject: [PATCH 3/5] Fix Unicode encoding issues and update CI workflow - Replace emoji characters with ASCII equivalents in all Python files - Prevents UnicodeEncodeError in Windows CI environment - Update CI workflow to use Python 3.8-3.12 (3.7 no longer available) - Update GitHub Actions to latest versions (checkout@v4, setup-python@v4) - Ensures cross-platform compatibility for all CI environments --- .github/workflows/tests.yml | 12 ++++++------ proxyChecker.py | 28 ++++++++++++++-------------- proxyScraper.py | 2 +- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3004d06..0b962d4 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -15,20 +15,20 @@ jobs: os: - ubuntu-latest python-version: - - '3.7' - '3.8' - '3.9' - '3.10' - - 'pypy-3.8' + - '3.11' + - '3.12' include: - os: windows-latest - python-version: '3.10' + python-version: '3.11' - os: macos-latest - python-version: '3.10' + python-version: '3.11' steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Install Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/proxyChecker.py b/proxyChecker.py index 6e3ee80..c1fa9a8 100644 --- a/proxyChecker.py +++ b/proxyChecker.py @@ -17,7 +17,7 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) -#fallback user agents (will be extended from user_agents.txt if available) +# Fallback user agents (will be 
extended from user_agents.txt if available) user_agents = [ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0", @@ -153,7 +153,7 @@ def _check_socks_proxy(self, site: str, timeout: int, verbose: bool, start_time: end_time = time() time_taken = end_time - start_time - verbose_print(verbose, f"โœ“ Proxy {self.proxy} ({self.method.upper()}) is valid, time: {time_taken:.2f}s") + verbose_print(verbose, f"[+] Proxy {self.proxy} ({self.method.upper()}) is valid, time: {time_taken:.2f}s") return True, time_taken, None finally: @@ -162,7 +162,7 @@ def _check_socks_proxy(self, site: str, timeout: int, verbose: bool, start_time: except Exception as e: socket.socket = original_socket # Ensure cleanup even on error - verbose_print(verbose, f"โœ— Proxy {self.proxy} ({self.method.upper()}) failed: {e}") + verbose_print(verbose, f"[-] Proxy {self.proxy} ({self.method.upper()}) failed: {e}") return False, 0.0, e def _check_http_proxy(self, site: str, timeout: int, user_agent: str, verbose: bool, start_time: float) -> Tuple[bool, float, Optional[Exception]]: @@ -190,11 +190,11 @@ def _check_http_proxy(self, site: str, timeout: int, user_agent: str, verbose: b end_time = time() time_taken = end_time - start_time - verbose_print(verbose, f"โœ“ Proxy {self.proxy} ({self.method.upper()}) is valid, time: {time_taken:.2f}s") + verbose_print(verbose, f"[+] Proxy {self.proxy} ({self.method.upper()}) is valid, time: {time_taken:.2f}s") return True, time_taken, None except Exception as e: - verbose_print(verbose, f"โœ— Proxy {self.proxy} ({self.method.upper()}) failed: {e}") + verbose_print(verbose, f"[-] Proxy {self.proxy} ({self.method.upper()}) failed: {e}") return False, 0.0, e def __str__(self) -> str: @@ -406,7 +406,7 @@ def check(file: str, timeout: int, method: str, site: str, verbose: bool, random print(f"Average time per proxy: {elapsed_time/len(proxies):.2f}s") if len(valid_proxies) == 0: - print("โš ๏ธ No working proxies found. Consider:") + print("WARNING: No working proxies found. 
Consider:") print(" - Increasing timeout value") print(" - Trying a different target site") print(" - Using fresh proxy list") @@ -519,13 +519,13 @@ def main() -> None: site = _configure_logging_and_validate_args(args) # Display startup information - print("๐Ÿ” Proxy Checker v2.0") - print(f"๐Ÿ“ Proxy file: {args.list}") - print(f"๐ŸŽฏ Target site: {site}") - print(f"โฑ๏ธ Timeout: {args.timeout}s") - print(f"๐Ÿ”ง Method: {args.proxy.upper()}") - print(f"๐Ÿงต Max threads: {args.max_threads}") - print(f"๐Ÿ‘ค User agents: {len(user_agents)} available") + print("*** Proxy Checker v2.0 ***") + print(f"Proxy file: {args.list}") + print(f"Target site: {site}") + print(f"Timeout: {args.timeout}s") + print(f"Method: {args.proxy.upper()}") + print(f"Max threads: {args.max_threads}") + print(f"User agents: {len(user_agents)} available") print("=" * 60) try: @@ -539,7 +539,7 @@ def main() -> None: ) except KeyboardInterrupt: - print("\nโš ๏ธ Operation interrupted by user") + print("\nWARNING: Operation interrupted by user") sys.exit(1) except Exception as e: logger.error(f"Proxy checking failed: {e}") diff --git a/proxyScraper.py b/proxyScraper.py index c774462..ec5c56f 100644 --- a/proxyScraper.py +++ b/proxyScraper.py @@ -505,7 +505,7 @@ def _print_source_statistics(verbose: bool, source_stats: Dict) -> None: if not verbose: return - print("\n๐Ÿ“Š Source Statistics:") + print("\n*** Source Statistics ***") print("-" * 50) total_bad_filtered = 0 total_invalid_filtered = 0 From 86d0f7833b9fce7b78523ce59ca5d5bac82b2a27 Mon Sep 17 00:00:00 2001 From: FosterG4 Date: Sat, 26 Jul 2025 23:21:54 +0700 Subject: [PATCH 4/5] Fix complexity issues and update ProxyScrape API to v4 --- .flake8 | 4 +- .github/workflows/tests.yml | 2 +- README.md | 99 ++++++++-- proxyChecker.py | 84 ++++++--- proxyScraper.py | 367 +++++++++++++++++++++++------------- 5 files changed, 372 insertions(+), 184 deletions(-) diff --git a/.flake8 b/.flake8 index 71dc6e8..fb8bfa7 100644 --- a/.flake8 +++ b/.flake8 @@ -1,8 +1,8 @@ [flake8] exclude = .git,__pycache__,env,venv,.eggs,.tox,.nox,build,dist -max-line-lenght = 120 +max-line-length = 120 max-complexity = 8 ignore = W,BLK, - E24,E121,E123,E125,E126,E221,E226,E266,E704, + E24,E121,E123,E126,E221,E226,E266,E704, E265,E722,E501,E731,E306,E401,E302,E222,E303, E402,E305,E261,E262,E203,N816 \ No newline at end of file diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 0b962d4..44e36d8 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -41,6 +41,6 @@ jobs: - name: Run proxyScraper run: python3 proxyScraper.py -p http - name: Run proxyChecker - run: python3 proxyChecker.py -t 20 -s google.com -l output.txt + run: python3 proxyChecker.py -t 20 -s google.com -l output.txt --limit 10 - name: Run proxyGeolocation run: python3 proxyGeolocation.py -i 8.8.8.8 \ No newline at end of file diff --git a/README.md b/README.md index cef17f5..f3b69b9 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Tests](https://github.com/iw4p/proxy-scraper/actions/workflows/tests.yml/badge.svg)](https://github.com/iw4p/proxy-scraper/actions/workflows/tests.yml) [![Downloads](https://static.pepy.tech/badge/proxyz)](https://pepy.tech/project/proxyz) -**Fast, reliable proxy scraper that collects 30K+ HTTP/HTTPS/SOCKS proxies from 24+ sources in seconds.** +**Fast, reliable proxy scraper that collects 30K+ HTTP/HTTPS/SOCKS proxies from 5+ sources in seconds.** โœจ **Features:** - โšก **Fast scraping** - All sources scraped concurrently @@ -56,18 +56,49 @@ 
python proxyChecker.py --help # Get HTTP proxies (basic) proxy_scraper -p http -# Get SOCKS5 proxies with detailed output -proxy_scraper -p socks5 -v +# Get HTTPS proxies +proxy_scraper -p https -# Save to custom file -proxy_scraper -p http -o my_proxies.txt -v +# Get SOCKS4 proxies +proxy_scraper -p socks4 + +# Get SOCKS5 proxies +proxy_scraper -p socks5 + +# Get all SOCKS proxies (SOCKS4 + SOCKS5) +proxy_scraper -p socks + +# Save to custom file (example: HTTP) +proxy_scraper -p http -o output.txt -v + +# Save HTTPS proxies with verbose output +proxy_scraper -p https -v -o output.txt + +# Save SOCKS4 proxies +proxy_scraper -p socks4 -o output.txt + +# Save SOCKS5 proxies +proxy_scraper -p socks5 -o output.txt ``` + ### Step 2: Check Proxy Quality ```bash -# Test scraped proxies (basic) +# Test scraped HTTP proxies (basic) proxy_checker -l output.txt -t 10 +# Test HTTP proxies +proxy_checker -p http -l output.txt -t 10 + +# Test HTTPS proxies +proxy_checker -p https -l output.txt -t 10 + +# Test SOCKS4 proxies +proxy_checker -p socks4 -l output.txt -t 10 + +# Test SOCKS5 proxies +proxy_checker -p socks5 -l output.txt -t 10 + # Test against specific site with verbose output proxy_checker -l output.txt -s https://google.com -v @@ -78,12 +109,30 @@ proxy_checker -l output.txt -r -v ### Step 3: Complete Workflow Example ```bash # 1. Scrape HTTP proxies -proxy_scraper -p http -v -o fresh_proxies.txt +proxy_scraper -p http -v -o output.txt + +# 2. Scrape HTTPS proxies +proxy_scraper -p https -v -o output.txt + +# 3. Scrape SOCKS4 proxies +proxy_scraper -p socks4 -v -o output.txt + +# 4. Scrape SOCKS5 proxies +proxy_scraper -p socks5 -v -o output.txt + +# 5. Check HTTP proxies +proxy_checker -l output.txt -t 15 -v -# 2. Check their quality -proxy_checker -l fresh_proxies.txt -t 15 -v +# 6. Check HTTPS proxies +proxy_checker -l output.txt -t 15 -v -# 3. Result: output.txt contains only working proxies +# 7. Check SOCKS4 proxies +proxy_checker -l output.txt -t 15 -v + +# 8. Check SOCKS5 proxies +proxy_checker -l output.txt -t 15 -v + +# 9. 
Result: output.txt contains only working proxies (for each type) ``` ## Supported Proxy Types @@ -100,14 +149,14 @@ We collect proxies from **24 sources**: - spys.me, free-proxy-list.net, proxyscrape.com, geonode.com - sslproxies.org, us-proxy.org, socks-proxy.net - proxy-list.download, proxyscan.io, proxyspace.pro -- freeproxy.lunaproxy.com +- freeproxy.lunaproxy.com, more **๐Ÿ“ฆ GitHub Repositories (13 sources)** - proxifly/free-proxy-list, monosans/proxy-list, TheSpeedX/PROXY-List - jetkai/proxy-list, roosterkid/openproxylist, mmpx12/proxy-list - ShiftyTR/Proxy-List, clarketm/proxy-list, sunny9577/proxy-scraper - zloi-user/hideip.me, almroot/proxylist, aslisk/proxyhttps -- proxy4parsing/proxy-list +- proxy4parsing/proxy-list, more ## Advanced Usage @@ -121,6 +170,8 @@ Options: -p, --proxy Proxy type: http, https, socks, socks4, socks5 -o, --output Output file (default: output.txt) -v, --verbose Show detailed statistics + -l, --list Input proxy file (default: output.txt) + -h, --help Show this help message ``` **Checking:** @@ -129,10 +180,13 @@ proxy_checker [-l input.txt] [-t timeout] [-s site] [-v] Options: -l, --list Input proxy file (default: output.txt) + -p, --proxy Proxy type: http, https, socks, socks4, socks5 + -o, --output Output file (default: output.txt) -t, --timeout Timeout in seconds (default: 20) -s, --site Test site (default: https://google.com) -r, --random_agent Use random user agents -v, --verbose Show detailed progress + --max-threads Maximum concurrent threads (default: 10) ``` ### From Source Code @@ -160,15 +214,20 @@ python proxyChecker.py -l output.txt -v ## Example Output ```bash -Scraping proxies using 24 sources... -๐Ÿ“Š Source Statistics: +*** Source Statistics *** -------------------------------------------------- -ProxyScrapeScraper: 18769 valid, 16408 bad IPs filtered -PlainTextScraper: 13516 valid, 5515 bad IPs filtered -GitHubScraper: 1767 valid, 739 bad IPs filtered -... -Total filtered: 22177 bad IPs (CDN/etc), 1 invalid format -Found 30938 unique valid proxies +PlainTextScraper: 0 valid, 0 bad IPs, 0 invalid +GeneralTableScraper: 0 valid, 0 bad IPs, 0 invalid +ProxyScrapeScraper: 1666 valid, 334 bad IPs, 0 invalid +GitHubScraper: 0 valid, 0 bad IPs, 0 invalid +ProxyListApiScraper: 261 valid, 0 bad IPs, 0 invalid +GeneralDivScraper: 0 valid, 0 bad IPs, 0 invalid +SpysMeScraper: 400 valid, 0 bad IPs, 0 invalid + +Total filtered: 334 bad IPs (CDN/etc), 0 invalid format +Writing 37030 unique proxies to output.txt... +Scraping completed in 13.13 seconds +Found 37030 unique valid proxies ``` ## ๐ŸŒ Proxy Geolocation & Analysis diff --git a/proxyChecker.py b/proxyChecker.py index c1fa9a8..f0398b3 100644 --- a/proxyChecker.py +++ b/proxyChecker.py @@ -238,13 +238,14 @@ def _read_proxy_file(file_path: str) -> List[str]: sys.exit(1) -def load_proxies_from_file(file_path: str, method: str) -> List[Proxy]: +def load_proxies_from_file(file_path: str, method: str, limit: Optional[int] = None) -> List[Proxy]: """ Load proxies from file and create Proxy objects. 
Args: file_path: Path to proxy list file method: Proxy method to use + limit: Maximum number of proxies to load (None for all) Returns: List of valid Proxy objects @@ -255,19 +256,22 @@ def load_proxies_from_file(file_path: str, method: str) -> List[Proxy]: lines = _read_proxy_file(file_path) for line_num, line in enumerate(lines, 1): + # Check if we've reached the limit + if limit is not None and len(proxies) >= limit: + logger.info(f"Reached limit of {limit} proxies, stopping load") + break + proxy = _process_proxy_line(line, line_num, method) if proxy is not None: proxies.append(proxy) else: if line.strip() and not line.strip().startswith('#'): invalid_count += 1 - + if invalid_count > 0: logger.warning(f"Skipped {invalid_count} invalid proxy entries") return proxies - - def save_valid_proxies(file_path: str, valid_proxies: List[Proxy]) -> None: """ Save valid proxies back to file. @@ -291,10 +295,10 @@ def save_valid_proxies(file_path: str, valid_proxies: List[Proxy]) -> None: raise -def _prepare_checking_environment(file: str, method: str, site: str, timeout: int, random_user_agent: bool) -> Tuple[List[Proxy], str, int]: +def _prepare_checking_environment(file: str, method: str, site: str, timeout: int, random_user_agent: bool, limit: Optional[int] = None) -> Tuple[List[Proxy], str, int]: """Prepare the environment for proxy checking.""" print(f"Loading proxies from {file}...") - proxies = load_proxies_from_file(file, method) + proxies = load_proxies_from_file(file, method, limit) print(f"Loaded {len(proxies)} valid proxies for checking") if not proxies: @@ -346,7 +350,7 @@ def check_single_proxy(proxy: Proxy) -> None: return check_single_proxy -def check(file: str, timeout: int, method: str, site: str, verbose: bool, random_user_agent: bool) -> None: +def check(file: str, timeout: int, method: str, site: str, verbose: bool, random_user_agent: bool, limit: Optional[int] = None) -> None: """ Main proxy checking function. 
@@ -357,12 +361,13 @@ def check(file: str, timeout: int, method: str, site: str, verbose: bool, random site: Target website for testing verbose: Enable verbose output random_user_agent: Use random user agent per proxy + limit: Maximum number of proxies to check """ start_time = time() # Prepare checking environment proxies, base_user_agent, max_threads = _prepare_checking_environment( - file, method, site, timeout, random_user_agent, + file, method, site, timeout, random_user_agent, limit, ) if not proxies: @@ -379,24 +384,12 @@ def check(file: str, timeout: int, method: str, site: str, verbose: bool, random random_user_agent, base_user_agent, len(proxies), verbose, ) - # Execute checking with thread pool - with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor: - futures = [executor.submit(check_single_proxy, proxy) for proxy in proxies] - - try: - concurrent.futures.wait(futures, timeout=None) - except KeyboardInterrupt: - print("\nChecking interrupted by user") - executor.shutdown(wait=False) - return - - # Save results - save_valid_proxies(file, valid_proxies) - - # Final statistics + _run_proxy_check_threadpool( + check_single_proxy, proxies, valid_proxies, checked_count_ref, file, start_time, + ) elapsed_time = time() - start_time + # Final statistics success_rate = (len(valid_proxies) / len(proxies)) * 100 if proxies else 0 - print("-" * 60) print("Proxy checking completed!") print(f"Total checked: {len(proxies)}") @@ -404,7 +397,6 @@ def check(file: str, timeout: int, method: str, site: str, verbose: bool, random print(f"Success rate: {success_rate:.1f}%") print(f"Time taken: {elapsed_time:.2f} seconds") print(f"Average time per proxy: {elapsed_time/len(proxies):.2f}s") - if len(valid_proxies) == 0: print("WARNING: No working proxies found. Consider:") print(" - Increasing timeout value") @@ -412,6 +404,32 @@ def check(file: str, timeout: int, method: str, site: str, verbose: bool, random print(" - Using fresh proxy list") +def _run_proxy_check_threadpool(check_single_proxy, proxies, valid_proxies, checked_count_ref, file, start_time): + """Helper to run proxy checking in a thread pool, handles KeyboardInterrupt and saving.""" + executor = None + try: + executor = concurrent.futures.ThreadPoolExecutor(max_workers=min(len(proxies), 100)) + futures = [executor.submit(check_single_proxy, proxy) for proxy in proxies] + for _ in concurrent.futures.as_completed(futures): + pass + except KeyboardInterrupt: + print("\n[!] Proxy checking cancelled by user. Stopping threads and saving progress...") + if executor is not None: + try: + executor.shutdown(wait=False, cancel_futures=True) + except Exception: + pass + save_valid_proxies(file, valid_proxies) + elapsed_time = time() - start_time + print("-" * 60) + print(f"Check cancelled. 
{len(valid_proxies)} valid proxies saved to {file}.") + print(f"Checked: {checked_count_ref[0]} / {len(proxies)} | Time: {elapsed_time:.2f}s") + sys.exit(130) + if executor is not None: + executor.shutdown(wait=True) + save_valid_proxies(file, valid_proxies) + + def _setup_argument_parser() -> argparse.ArgumentParser: """Set up and configure the argument parser.""" parser = argparse.ArgumentParser( @@ -422,11 +440,15 @@ def _setup_argument_parser() -> argparse.ArgumentParser: %(prog)s -p http -t 10 -v # Check HTTP proxies with 10s timeout %(prog)s -p socks4 -l socks.txt -r # Check SOCKS4 with random user agents %(prog)s -p https -s httpbin.org/ip --debug # Check HTTPS proxies against custom site - + %(prog)s -p http --limit 50 -v # Check only the first 50 HTTP proxies + %(prog)s -p socks5 -l proxies.txt -t 30 --max-threads 20 # Check SOCKS5 proxies with 30s timeout and 20 threads Notes: - Dead proxies are automatically removed from the list file - Use --debug for detailed error information - Higher timeout values may find more working proxies but take longer + - Use --limit for quick testing or when you don't want to check all proxies + - Random user agents can help avoid detection by target sites + - Use --max-threads to control concurrency, default is 10 """, ) @@ -470,9 +492,14 @@ def _setup_argument_parser() -> argparse.ArgumentParser: parser.add_argument( "--max-threads", type=int, - default=100, + default=10, help="Maximum number of concurrent threads (default: %(default)s)", ) + parser.add_argument( + "--limit", + type=int, + help="Maximum number of proxies to check (default: check all)", + ) return parser @@ -525,6 +552,8 @@ def main() -> None: print(f"Timeout: {args.timeout}s") print(f"Method: {args.proxy.upper()}") print(f"Max threads: {args.max_threads}") + if args.limit: + print(f"Limit: {args.limit} proxies") print(f"User agents: {len(user_agents)} available") print("=" * 60) @@ -536,6 +565,7 @@ def main() -> None: site=site, verbose=args.verbose, random_user_agent=args.random_agent, + limit=args.limit, ) except KeyboardInterrupt: diff --git a/proxyScraper.py b/proxyScraper.py index ec5c56f..24a2062 100644 --- a/proxyScraper.py +++ b/proxyScraper.py @@ -7,6 +7,7 @@ import sys import time from typing import Dict, List, Optional, Set, Tuple +from urllib.parse import urlparse import httpx from bs4 import BeautifulSoup @@ -15,6 +16,44 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) +# --- Module-level helpers for source statistics --- +def _extract_domain(url): + """Extract domain from URL for statistics.""" + try: + domain = urlparse(url).netloc or urlparse('//' + url).netloc + if not domain: + domain = url + except Exception: + domain = url + return domain + +def _aggregate_domain_stats(source_stats): + """Aggregate statistics by domain.""" + total_bad_filtered = 0 + total_invalid_filtered = 0 + domain_valid = {} + skipped = 0 + for source, stats in source_stats.items(): + url = source.split(": ", 1)[-1] + domain = _extract_domain(url) + if stats['valid'] > 0: + domain_valid[domain] = domain_valid.get(domain, 0) + stats['valid'] + else: + skipped += 1 + total_bad_filtered += stats['filtered_bad'] + total_invalid_filtered += stats['filtered_invalid'] + return domain_valid, skipped, total_bad_filtered, total_invalid_filtered + +def _print_summary(domain_valid, skipped, total_bad_filtered, total_invalid_filtered): + """Print formatted statistics summary.""" + print("\n*** Source Statistics ***") + 
print("-" * 50) + for domain, valid_count in sorted(domain_valid.items(), key=lambda x: -x[1]): + print(f"{valid_count} valid from {domain}") + if skipped: + print(f"...{skipped} sources returned 0 valid proxies and are hidden...") + print(f"\nTotal filtered: {total_bad_filtered} bad IPs (CDN/etc), {total_invalid_filtered} invalid format") + # Known bad IP ranges to filter out (Cloudflare, major CDNs, etc.) BAD_IP_RANGES = [ # Cloudflare @@ -100,41 +139,41 @@ def filter_proxies(self, proxy_text: str) -> Tuple[Set[str], Dict[str, int]]: line = line.strip() if not line: continue - + stats["total"] += 1 - + # Basic format validation if ':' not in line: stats["filtered_invalid"] += 1 continue - + try: ip, port = line.split(':', 1) ip = ip.strip() port = port.strip() - + # Validate IP format ipaddress.ip_address(ip) - + # Validate port port_num = int(port) if not (1 <= port_num <= 65535): stats["filtered_invalid"] += 1 continue - + # Check if it's a bad IP (CDN, etc.) if is_bad_ip(ip): stats["filtered_bad"] += 1 logger.debug(f"Filtered bad IP from {self.source_name}: {ip}:{port}") continue - + proxies.add(f"{ip}:{port}") stats["valid"] += 1 - + except (ValueError, ipaddress.AddressValueError): stats["filtered_invalid"] += 1 continue - + return proxies, stats async def scrape(self, client: httpx.AsyncClient) -> Tuple[List[str], Dict[str, int]]: @@ -171,48 +210,56 @@ def get_url(self, **kwargs) -> str: raise NotImplementedError(f"Method {self.method} not supported by SpysMeScraper") return super().get_url(mode=mode, **kwargs) + async def handle(self, response: httpx.Response) -> str: + """Parse spys.me format to extract only IP:port.""" + try: + lines = response.text.strip().split('\n') + proxies: Set[str] = set() + + for line in lines: + line = line.strip() + if not line: + continue + + # Skip header lines and comments + if (line.startswith('Proxy list') or + line.startswith('Socks proxy=') or + line.startswith('Support by') or + line.startswith('BTC ') or + line.startswith('IP address:Port') or + line.startswith('#')): + continue + + # Extract IP:port from lines like "89.58.55.193:80 DE-A + " + # The format is: IP:PORT COUNTRY-ANONYMITY-SSL GOOGLE_PASSED + parts = line.split() + if parts and ':' in parts[0]: + proxy = parts[0].strip() + # Validate IP:port format + if re.match(r"\d{1,3}(?:\.\d{1,3}){3}:\d{1,5}", proxy): + proxies.add(proxy) + + return "\n".join(proxies) + except Exception as e: + logger.debug(f"Error parsing spys.me format: {e}") + return "" + # From proxyscrape.com class ProxyScrapeScraper(Scraper): - """Scraper for proxyscrape.com API.""" + """Scraper for proxyscrape.com v4 API.""" - def __init__(self, method: str, timeout: int = 1000, country: str = "All"): - self.api_timeout = timeout # Renamed to avoid confusion with HTTP timeout + def __init__(self, method: str, country: str = "all"): self.country = country super().__init__(method, - "https://api.proxyscrape.com/?request=getproxies" - "&proxytype={method}" - "&timeout={api_timeout}" - "&country={country}", - timeout=20) # HTTP timeout + "https://api.proxyscrape.com/v4/free-proxy-list/get?" 
+ "request=display_proxies&proxy_format=ipport&format=text" + "&protocol={method}&country={country}", + timeout=20) def get_url(self, **kwargs) -> str: """Get URL with API parameters.""" - return super().get_url(api_timeout=self.api_timeout, country=self.country, **kwargs) - -# From geonode.com - A little dirty, grab http(s) and socks but use just for socks -class GeoNodeScraper(Scraper): - """Scraper for geonode.com proxy API.""" - - def __init__(self, method: str, limit: str = "500", page: str = "1", - sort_by: str = "lastChecked", sort_type: str = "desc"): - self.limit = limit - self.page = page - self.sort_by = sort_by - self.sort_type = sort_type - super().__init__(method, - "https://proxylist.geonode.com/api/proxy-list?" - "&limit={limit}" - "&page={page}" - "&sort_by={sort_by}" - "&sort_type={sort_type}", - timeout=15) - - def get_url(self, **kwargs) -> str: - """Get URL with API parameters.""" - return super().get_url(limit=self.limit, page=self.page, - sort_by=self.sort_by, sort_type=self.sort_type, **kwargs) - + return super().get_url(country=self.country, **kwargs) # From proxy-list.download class ProxyListDownloadScraper(Scraper): @@ -321,29 +368,27 @@ class ProxyListApiScraper(Scraper): """Scraper for APIs that return JSON proxy lists.""" def _extract_proxy_from_item(self, item: dict) -> Optional[str]: - """Extract proxy string from a single item.""" + """Extract proxy string from a single item for new www.proxy-list.download format.""" if not isinstance(item, dict): return None - - ip = item.get('ip') - port = item.get('port') + # Support both old and new keys + ip = item.get('ip') or item.get('IP') + port = item.get('port') or item.get('PORT') if ip and port: return f"{ip}:{port}" return None - - def _process_list_data(self, data: list) -> Set[str]: - """Process list-type JSON data.""" - proxies = set() - for item in data: - proxy = self._extract_proxy_from_item(item) - if proxy: - proxies.add(proxy) - return proxies - + def _process_dict_data(self, data: dict) -> Set[str]: - """Process dict-type JSON data.""" + """Process dict-type JSON data for new www.proxy-list.download format.""" proxies = set() - if 'data' in data and isinstance(data['data'], list): + # New format: proxies are in 'LISTA' key + if 'LISTA' in data and isinstance(data['LISTA'], list): + for item in data['LISTA']: + proxy = self._extract_proxy_from_item(item) + if proxy: + proxies.add(proxy) + # Fallback for old format + elif 'data' in data and isinstance(data['data'], list): for item in data['data']: proxy = self._extract_proxy_from_item(item) if proxy: @@ -351,22 +396,41 @@ def _process_dict_data(self, data: dict) -> Set[str]: return proxies async def handle(self, response: httpx.Response) -> str: - """Parse JSON API response for proxies.""" + """Parse JSON API response for proxies (new and old format).""" try: data = response.json() proxies: Set[str] = set() - - # Handle different JSON structures - if isinstance(data, list): - proxies = self._process_list_data(data) - elif isinstance(data, dict): + if isinstance(data, dict): proxies = self._process_dict_data(data) - return "\n".join(proxies) except Exception as e: logger.debug(f"Error parsing JSON API response: {e}") return "" +# Helper functions for PlainTextScraper +def _is_protocol_match(protocol: str, method: str) -> bool: + """Check if protocol matches the scraper method.""" + return (protocol.lower() == method.lower() or + (method == "socks" and protocol.lower() in ["socks4", "socks5"])) + +def _is_valid_proxy_format(address: str) -> bool: + 
"""Validate IP:port format.""" + return bool(re.match(r"\d{1,3}(?:\.\d{1,3}){3}:\d{1,5}", address)) + +def _process_protocol_line(line: str, method: str) -> Optional[str]: + """Process a line with protocol://ip:port format.""" + protocol, address = line.split("://", 1) + if _is_protocol_match(protocol, method): + if _is_valid_proxy_format(address): + return address + return None + +def _process_plain_line(line: str) -> Optional[str]: + """Process a plain IP:port line.""" + if _is_valid_proxy_format(line): + return line + return None + # For scraping from plain text sources class PlainTextScraper(Scraper): """Scraper for plain text proxy lists.""" @@ -381,91 +445,129 @@ async def handle(self, response: httpx.Response) -> str: line = line.strip() if not line or line.startswith('#'): continue - - # Look for IP:port pattern - if re.match(r"\d{1,3}(?:\.\d{1,3}){3}:\d{1,5}", line): - proxies.add(line) - + + # Handle protocol://ip:port format (ProxyScrape v4 API) + if "://" in line: + proxy = _process_protocol_line(line, self.method) + if proxy: + proxies.add(proxy) + else: + # Look for plain IP:port pattern (legacy format) + proxy = _process_plain_line(line) + if proxy: + proxies.add(proxy) + return "\n".join(proxies) except Exception as e: logger.debug(f"Error parsing plain text proxy list: {e}") return "" -# Improved scrapers list with better organization +# Latest and most frequently updated proxy sources (2025) scrapers = [ - # Direct API scrapers + # Primary API scrapers (most reliable) SpysMeScraper("http"), SpysMeScraper("socks"), ProxyScrapeScraper("http"), ProxyScrapeScraper("socks4"), ProxyScrapeScraper("socks5"), - GeoNodeScraper("socks"), - # Download API scrapers - ProxyListDownloadScraper("https", "elite"), - ProxyListDownloadScraper("http", "elite"), - ProxyListDownloadScraper("http", "transparent"), - ProxyListDownloadScraper("http", "anonymous"), - - # HTML table scrapers - GeneralTableScraper("https", "http://sslproxies.org"), - GeneralTableScraper("http", "http://free-proxy-list.net"), - GeneralTableScraper("http", "http://us-proxy.org"), - GeneralTableScraper("socks", "http://socks-proxy.net"), - - # HTML div scrapers - GeneralDivScraper("http", "https://freeproxy.lunaproxy.com/"), - - # GitHub raw list scrapers (established sources) - GitHubScraper("http", "https://raw.githubusercontent.com/proxifly/free-proxy-list/main/proxies/all/data.txt"), - GitHubScraper("socks4", "https://raw.githubusercontent.com/proxifly/free-proxy-list/main/proxies/all/data.txt"), - GitHubScraper("socks5", "https://raw.githubusercontent.com/proxifly/free-proxy-list/main/proxies/all/data.txt"), - GitHubScraper("http", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/all.txt"), - GitHubScraper("socks", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/all.txt"), - GitHubScraper("https", "https://raw.githubusercontent.com/zloi-user/hideip.me/main/https.txt"), - GitHubScraper("http", "https://raw.githubusercontent.com/zloi-user/hideip.me/main/http.txt"), - GitHubScraper("socks4", "https://raw.githubusercontent.com/zloi-user/hideip.me/main/socks4.txt"), - GitHubScraper("socks5", "https://raw.githubusercontent.com/zloi-user/hideip.me/main/socks5.txt"), - - # Additional GitHub sources + # TheSpeedX/PROXY-List (updated daily) GitHubScraper("http", "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt"), GitHubScraper("socks4", "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt"), GitHubScraper("socks5", 
"https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt"), - GitHubScraper("http", "https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/http.txt"), - GitHubScraper("https", "https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/https.txt"), - GitHubScraper("socks4", "https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks4.txt"), - GitHubScraper("socks5", "https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks5.txt"), + + # jetkai/proxy-list (hourly updates, geolocation) GitHubScraper("http", "https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-http.txt"), GitHubScraper("https", "https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-https.txt"), GitHubScraper("socks4", "https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-socks4.txt"), GitHubScraper("socks5", "https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-socks5.txt"), - GitHubScraper("http", "https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt"), - GitHubScraper("http", "https://raw.githubusercontent.com/sunny9577/proxy-scraper/master/proxies.txt"), + + # prxchk/proxy-list (10 min updates, deduplicated) + GitHubScraper("http", "https://raw.githubusercontent.com/prxchk/proxy-list/main/http.txt"), + GitHubScraper("socks4", "https://raw.githubusercontent.com/prxchk/proxy-list/main/socks4.txt"), + GitHubScraper("socks5", "https://raw.githubusercontent.com/prxchk/proxy-list/main/socks5.txt"), + + # roosterkid/openproxylist (hourly updates) GitHubScraper("http", "https://raw.githubusercontent.com/roosterkid/openproxylist/main/HTTPS_RAW.txt"), GitHubScraper("socks4", "https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS4_RAW.txt"), GitHubScraper("socks5", "https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS5_RAW.txt"), + + # mmpx12/proxy-list (hourly updates) GitHubScraper("http", "https://raw.githubusercontent.com/mmpx12/proxy-list/master/http.txt"), GitHubScraper("https", "https://raw.githubusercontent.com/mmpx12/proxy-list/master/https.txt"), GitHubScraper("socks4", "https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks4.txt"), GitHubScraper("socks5", "https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks5.txt"), - - # Plain text sources + + + + # ProxyScrape API v4 (live, no key needed) + PlainTextScraper("http", "https://api.proxyscrape.com/v4/free-proxy-list/get?request=display_proxies&protocol=http&proxy_format=protocolipport&format=text&timeout=20000"), + PlainTextScraper("socks4", "https://api.proxyscrape.com/v4/free-proxy-list/get?request=display_proxies&protocol=socks4&proxy_format=protocolipport&format=text&timeout=20000"), + PlainTextScraper("socks5", "https://api.proxyscrape.com/v4/free-proxy-list/get?request=display_proxies&protocol=socks5&proxy_format=protocolipport&format=text&timeout=20000"), + + # OpenProxyList API (10 min updates) + PlainTextScraper("http", "https://api.openproxylist.xyz/http.txt"), + PlainTextScraper("https", "https://api.openproxylist.xyz/https.txt"), + PlainTextScraper("socks4", "https://api.openproxylist.xyz/socks4.txt"), + PlainTextScraper("socks5", "https://api.openproxylist.xyz/socks5.txt"), PlainTextScraper("http", "https://www.proxyscan.io/download?type=http"), PlainTextScraper("socks4", "https://www.proxyscan.io/download?type=socks4"), - PlainTextScraper("socks5", "https://www.proxyscan.io/download?type=socks5"), - 
PlainTextScraper("http", "https://raw.githubusercontent.com/almroot/proxylist/master/list.txt"), - PlainTextScraper("http", "https://raw.githubusercontent.com/aslisk/proxyhttps/main/https.txt"), - PlainTextScraper("http", "https://raw.githubusercontent.com/proxy4parsing/proxy-list/main/http.txt"), + PlainTextScraper("socks5", "https://raw.githubusercontent.com/Surfboardv2ray/Proxy-sorter/main/socks5.txt"), + + # JSON APIs + ProxyListApiScraper("http", "https://www.proxy-list.download/api/v2/get?l=en&t=http"), + ProxyListApiScraper("https", "https://www.proxy-list.download/api/v2/get?l=en&t=https"), + ProxyListApiScraper("socks4", "https://www.proxy-list.download/api/v2/get?l=en&t=socks4"), + ProxyListApiScraper("socks5", "https://www.proxy-list.download/api/v2/get?l=en&t=socks5"), + + # Fresh community sources (updated daily) + GitHubScraper("http", "https://raw.githubusercontent.com/prxchk/proxy-list/main/http.txt"), + GitHubScraper("socks4", "https://raw.githubusercontent.com/prxchk/proxy-list/main/socks4.txt"), + GitHubScraper("socks5", "https://raw.githubusercontent.com/prxchk/proxy-list/main/socks5.txt"), + + # Ultra-fresh sources (updated every few hours) + PlainTextScraper("http", "https://api.openproxylist.xyz/http.txt"), + PlainTextScraper("socks4", "https://api.openproxylist.xyz/socks4.txt"), + PlainTextScraper("socks5", "https://api.openproxylist.xyz/socks5.txt"), - # Additional table scrapers - GeneralTableScraper("http", "https://proxyspace.pro/http.txt"), - GeneralTableScraper("socks4", "https://proxyspace.pro/socks4.txt"), - GeneralTableScraper("socks5", "https://proxyspace.pro/socks5.txt"), + # Elite proxy APIs + + + # New 2025 sources + GitHubScraper("http", "https://raw.githubusercontent.com/rdavydov/proxy-list/main/proxies/http.txt"), + GitHubScraper("https", "https://raw.githubusercontent.com/rdavydov/proxy-list/main/proxies/https.txt"), + GitHubScraper("socks4", "https://raw.githubusercontent.com/rdavydov/proxy-list/main/proxies/socks4.txt"), + GitHubScraper("socks5", "https://raw.githubusercontent.com/rdavydov/proxy-list/main/proxies/socks5.txt"), + + # Quality HTML scrapers (still active) + GeneralTableScraper("https", "http://sslproxies.org"), + GeneralTableScraper("http", "http://free-proxy-list.net"), + GeneralTableScraper("http", "http://us-proxy.org"), + GeneralTableScraper("socks", "http://socks-proxy.net"), - # API-based scrapers - ProxyListApiScraper("http", "https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc&protocols=http"), - ProxyListApiScraper("socks5", "https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc&protocols=socks5"), + + GeneralTableScraper("http", "https://premproxy.com/proxy-by-country/"), + GeneralTableScraper("https", "https://premproxy.com/socks-list/"), + GeneralTableScraper("http", "https://proxyservers.pro/proxy/list/protocol/http"), + GeneralTableScraper("https", "https://proxyservers.pro/proxy/list/protocol/https"), + + # Updated HTML div scrapers + GeneralDivScraper("http", "https://freeproxy.lunaproxy.com/"), + GeneralDivScraper("http", "https://www.freeproxylists.net/"), + GeneralDivScraper("socks4", "https://www.freeproxylists.net/socks4.html"), + GeneralDivScraper("socks5", "https://www.freeproxylists.net/socks5.html"), + + # Modern proxy sites with table format + GeneralTableScraper("http", "https://hidemy.name/en/proxy-list/?type=h"), + GeneralTableScraper("https", "https://hidemy.name/en/proxy-list/?type=s"), + 
GeneralTableScraper("socks4", "https://hidemy.name/en/proxy-list/?type=4"), + GeneralTableScraper("socks5", "https://hidemy.name/en/proxy-list/?type=5"), + + # Additional HTML sources + GeneralTableScraper("http", "https://www.proxynova.com/proxy-server-list/"), + GeneralTableScraper("http", "https://www.proxydocker.com/en/proxylist/"), + GeneralTableScraper("https", "https://www.proxydocker.com/en/proxylist/type/https"), ] @@ -504,16 +606,8 @@ def _print_source_statistics(verbose: bool, source_stats: Dict) -> None: """Print source statistics if verbose mode is enabled.""" if not verbose: return - - print("\n*** Source Statistics ***") - print("-" * 50) - total_bad_filtered = 0 - total_invalid_filtered = 0 - for source, stats in source_stats.items(): - print(f"{source}: {stats['valid']} valid, {stats['filtered_bad']} bad IPs, {stats['filtered_invalid']} invalid") - total_bad_filtered += stats['filtered_bad'] - total_invalid_filtered += stats['filtered_invalid'] - print(f"\nTotal filtered: {total_bad_filtered} bad IPs (CDN/etc), {total_invalid_filtered} invalid format") + domain_valid, skipped, total_bad_filtered, total_invalid_filtered = _aggregate_domain_stats(source_stats) + _print_summary(domain_valid, skipped, total_bad_filtered, total_invalid_filtered) async def scrape(method: str, output: str, verbose: bool) -> None: """ @@ -538,14 +632,16 @@ async def scrape(method: str, output: str, verbose: bool) -> None: async def scrape_source(scraper, client) -> None: """Scrape from a single source.""" try: + source_id = f"{scraper.source_name}: {scraper.get_url()}" verbose_print(verbose, f"Scraping from {scraper.get_url()}...") proxies, stats = await scraper.scrape(client) all_proxies.extend(proxies) - source_stats[scraper.source_name] = stats - verbose_print(verbose, f"Found {len(proxies)} valid proxies from {scraper.source_name} ({stats['filtered_bad']} bad IPs filtered, {stats['filtered_invalid']} invalid filtered)") + source_stats[source_id] = stats + verbose_print(verbose, f"Found {len(proxies)} valid proxies from {source_id} ({stats['filtered_bad']} bad IPs filtered, {stats['filtered_invalid']} invalid filtered)") except Exception as e: - logger.debug(f"Failed to scrape from {scraper.source_name}: {e}") - source_stats[scraper.source_name] = {"total": 0, "filtered_bad": 0, "filtered_invalid": 0, "valid": 0} + source_id = f"{scraper.source_name}: {scraper.get_url()}" + logger.debug(f"Failed to scrape from {source_id}: {e}") + source_stats[source_id] = {"total": 0, "filtered_bad": 0, "filtered_invalid": 0, "valid": 0} # Execute all scrapers concurrently async with httpx.AsyncClient(**client_config) as client: @@ -578,7 +674,10 @@ def _setup_argument_parser(): Examples: %(prog)s -p http -v # Scrape HTTP proxies with verbose output %(prog)s -p socks -o socks.txt # Scrape SOCKS proxies to custom file - %(prog)s -p https --verbose # Scrape HTTPS proxies with verbose output + %(prog)s -p https --verbose # Scrape HTTPS proxies with verbose output + %(prog)s -p socks4 --debug # Scrape SOCKS4 proxies with debug logging + %(prog)s -p socks5 -o output.txt -v # Scrape SOCKS5 proxies to output.txt with verbose logging + %(prog)s -p http -o proxies.txt --debug # Scrape HTTP proxies to proxies.txt with debug logging """, ) From ebcfef7e4cbad270624bc3cef4e9bd3e64961eea Mon Sep 17 00:00:00 2001 From: Dikky Hardian <30888372+FosterG4@users.noreply.github.com> Date: Sat, 26 Jul 2025 23:26:54 +0700 Subject: [PATCH 5/5] Update .flake8 --- .flake8 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/.flake8 b/.flake8 index fb8bfa7..2af47eb 100644 --- a/.flake8 +++ b/.flake8 @@ -3,6 +3,6 @@ exclude = .git,__pycache__,env,venv,.eggs,.tox,.nox,build,dist max-line-length = 120 max-complexity = 8 ignore = W,BLK, - E24,E121,E123,E126,E221,E226,E266,E704, + E24,E121,E123,E125,E126,E221,E226,E266,E704, E265,E722,E501,E731,E306,E401,E302,E222,E303, - E402,E305,E261,E262,E203,N816 \ No newline at end of file + E402,E305,E261,E262,E203,N816
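
Note (illustrative, not part of the patch): a minimal standalone sketch of how the new PlainTextScraper parsing helpers are expected to treat ProxyScrape v4 "protocol://ip:port" lines versus legacy "ip:port" lines. The regex and helper names mirror the diff above; _is_protocol_match is a simplified stand-in for the helper assumed elsewhere in proxyScraper.py, and the addresses are TEST-NET examples.

import re
from typing import Optional

def _is_valid_proxy_format(address: str) -> bool:
    # Same pattern used in the patch: an IP:port at the start of the string.
    return bool(re.match(r"\d{1,3}(?:\.\d{1,3}){3}:\d{1,5}", address))

def _is_protocol_match(protocol: str, method: str) -> bool:
    # Stand-in (assumption): treat a generic "socks" request as covering socks4/socks5.
    if method == "socks":
        return protocol in ("socks4", "socks5")
    return protocol == method

def _process_protocol_line(line: str, method: str) -> Optional[str]:
    # "socks5://1.2.3.4:1080" -> "1.2.3.4:1080" when the scheme matches the requested method.
    protocol, address = line.split("://", 1)
    if _is_protocol_match(protocol, method) and _is_valid_proxy_format(address):
        return address
    return None

def _process_plain_line(line: str) -> Optional[str]:
    # Legacy format: keep the line only if it already looks like IP:port.
    return line if _is_valid_proxy_format(line) else None

if __name__ == "__main__":
    print(_process_protocol_line("socks5://203.0.113.7:1080", "socks5"))  # 203.0.113.7:1080
    print(_process_protocol_line("http://203.0.113.7:8080", "socks5"))    # None (protocol mismatch)
    print(_process_plain_line("198.51.100.23:3128"))                      # 198.51.100.23:3128
    print(_process_plain_line("not-a-proxy"))                             # None (invalid format)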