diff --git a/.flake8 b/.flake8 index 71dc6e8..2af47eb 100644 --- a/.flake8 +++ b/.flake8 @@ -1,8 +1,8 @@ [flake8] exclude = .git,__pycache__,env,venv,.eggs,.tox,.nox,build,dist -max-line-lenght = 120 +max-line-length = 120 max-complexity = 8 ignore = W,BLK, E24,E121,E123,E125,E126,E221,E226,E266,E704, E265,E722,E501,E731,E306,E401,E302,E222,E303, - E402,E305,E261,E262,E203,N816 \ No newline at end of file + E402,E305,E261,E262,E203,N816 diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d7fdae8..44e36d8 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -15,20 +15,20 @@ jobs: os: - ubuntu-latest python-version: - - '3.7' - '3.8' - '3.9' - '3.10' - - 'pypy-3.8' + - '3.11' + - '3.12' include: - os: windows-latest - python-version: '3.10' + python-version: '3.11' - os: macos-latest - python-version: '3.10' + python-version: '3.11' steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Install Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -41,4 +41,6 @@ jobs: - name: Run proxyScraper run: python3 proxyScraper.py -p http - name: Run proxyChecker - run: python3 proxyChecker.py -t 20 -s google.com -l output.txt \ No newline at end of file + run: python3 proxyChecker.py -t 20 -s google.com -l output.txt --limit 10 + - name: Run proxyGeolocation + run: python3 proxyGeolocation.py -i 8.8.8.8 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 0bd7639..82bacc9 100644 --- a/.gitignore +++ b/.gitignore @@ -138,4 +138,54 @@ dmypy.json # Cython debug symbols cython_debug/ -output.txt \ No newline at end of file +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be added to the global gitignore or merged into this project gitignore +.idea/ + +# VS Code +.vscode/ +*.code-workspace + +# Project specific files +.github/copilot-instructions.md +output.txt +test_small.txt +test_local.py +*.txt +!requirements.txt +!dev_requirements.txt +!user_agents.txt +!README.txt + +# Temporary files +*.tmp +*.temp +*.swp +*.swo +*~ + +# OS generated files +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +# Logs +*.log +logs/ + +# Backup files +*.bak +*.backup + +# IDE files +*.sublime-project +*.sublime-workspace + +# Poetry/PDM (modern Python package managers) +poetry.lock +.pdm.toml \ No newline at end of file diff --git a/README.md b/README.md index 7e93215..f3b69b9 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,20 @@ -# Proxy Scraper and Checker +# Proxy Scraper & Checker [![Tests](https://github.com/iw4p/proxy-scraper/actions/workflows/tests.yml/badge.svg)](https://github.com/iw4p/proxy-scraper/actions/workflows/tests.yml) [![Downloads](https://static.pepy.tech/badge/proxyz)](https://pepy.tech/project/proxyz) -Scrape more than 1K HTTP - HTTPS - SOCKS4 - SOCKS5 proxies in less than 2 seconds. +**Fast, reliable proxy scraper that collects 30K+ HTTP/HTTPS/SOCKS proxies from 5+ sources in seconds.** -Scraping fresh public proxies from different sources: +✨ **Features:** +- ⚔ **Fast scraping** - All sources scraped concurrently +- šŸ›”ļø **Smart filtering** - Automatically removes CDN/bad IPs (Cloudflare, etc.) 
+- šŸŒ **Global coverage** - Proxies from Asia, Europe, Americas +- šŸ”§ **Easy to use** - Simple CLI interface +- āœ… **Quality checked** - Built-in proxy validation -- [sslproxies.org](http://sslproxies.org) (HTTP, HTTPS) -- [free-proxy-list.net](http://free-proxy-list.net) (HTTP, HTTPS) -- [us-proxy.org](http://us-proxy.org) (HTTP, HTTPS) -- [socks-proxy.net](http://socks-proxy.net) (Socks4, Socks5) -- [proxyscrape.com](https://proxyscrape.com) (HTTP, Socks4, Socks5) -- [proxy-list.download](https://www.proxy-list.download) (HTTP, HTTPS, Socks4, Socks5) -- [geonode.com](https://geonode.com) (HTTP, HTTPS, Socks4, Socks5) +## Installation & Setup -## Installation +### šŸ“¦ Option 1: Install from PyPI (Recommended) You can install the package directly from PyPI using `pip`: @@ -23,63 +22,276 @@ You can install the package directly from PyPI using `pip`: pip install proxyz ``` +**Verify installation:** +```bash +proxy_scraper --help +proxy_checker --help +``` + +### šŸ”§ Option 2: Install from Source Code + Alternatively, you can install dependencies manually if you're working from the source code: ```bash +# Clone the repository +git clone https://github.com/iw4p/proxy-scraper.git +cd proxy-scraper + +# Install dependencies pip3 install -r requirements.txt + +# Test the installation +python proxyScraper.py --help +python proxyChecker.py --help ``` -## Usage +### šŸ Python Requirements +- **Python 3.9+** (3.9, 3.10, 3.11, 3.12 supported) +- **Dependencies:** httpx, beautifulsoup4, pysocks + +## Quick Start Tutorial + +### Step 1: Scrape Proxies +```bash +# Get HTTP proxies (basic) +proxy_scraper -p http + +# Get HTTPS proxies +proxy_scraper -p https + +# Get SOCKS4 proxies +proxy_scraper -p socks4 + +# Get SOCKS5 proxies +proxy_scraper -p socks5 + +# Get all SOCKS proxies (SOCKS4 + SOCKS5) +proxy_scraper -p socks + +# Save to custom file (example: HTTP) +proxy_scraper -p http -o output.txt -v + +# Save HTTPS proxies with verbose output +proxy_scraper -p https -v -o output.txt -### Using the Command-Line Interface +# Save SOCKS4 proxies +proxy_scraper -p socks4 -o output.txt -Once installed via `pip`, you can use the command-line tools `proxy_scraper` and `proxy_checker` directly. +# Save SOCKS5 proxies +proxy_scraper -p socks5 -o output.txt +``` -#### For Scraping Proxies: +### Step 2: Check Proxy Quality ```bash -proxy_scraper -p http +# Test scraped HTTP proxies (basic) +proxy_checker -l output.txt -t 10 + +# Test HTTP proxies +proxy_checker -p http -l output.txt -t 10 + +# Test HTTPS proxies +proxy_checker -p https -l output.txt -t 10 + +# Test SOCKS4 proxies +proxy_checker -p socks4 -l output.txt -t 10 + +# Test SOCKS5 proxies +proxy_checker -p socks5 -l output.txt -t 10 + +# Test against specific site with verbose output +proxy_checker -l output.txt -s https://google.com -v + +# Use random user agents for testing +proxy_checker -l output.txt -r -v ``` -- With `-p` or `--proxy`, you can choose your proxy type. Supported proxy types are: **HTTP - HTTPS - Socks (Both 4 and 5) - Socks4 - Socks5**. -- With `-o` or `--output`, specify the output file name where the proxies will be saved. (Default is **output.txt**). -- With `-v` or `--verbose`, increase output verbosity. -- With `-h` or `--help`, show the help message. +### Step 3: Complete Workflow Example +```bash +# 1. Scrape HTTP proxies +proxy_scraper -p http -v -o output.txt + +# 2. Scrape HTTPS proxies +proxy_scraper -p https -v -o output.txt + +# 3. Scrape SOCKS4 proxies +proxy_scraper -p socks4 -v -o output.txt + +# 4. 
Scrape SOCKS5 proxies +proxy_scraper -p socks5 -v -o output.txt -#### For Checking Proxies: +# 5. Check HTTP proxies +proxy_checker -l output.txt -t 15 -v +# 6. Check HTTPS proxies +proxy_checker -l output.txt -t 15 -v + +# 7. Check SOCKS4 proxies +proxy_checker -l output.txt -t 15 -v + +# 8. Check SOCKS5 proxies +proxy_checker -l output.txt -t 15 -v + +# 9. Result: output.txt contains only working proxies (for each type) +``` + +## Supported Proxy Types +- **HTTP** - Web traffic +- **HTTPS** - Secure web traffic +- **SOCKS4** - TCP connections +- **SOCKS5** - TCP + UDP connections + +## Proxy Sources + +We collect proxies from **24 sources**: + +**🌐 Direct Websites (11 sources)** +- spys.me, free-proxy-list.net, proxyscrape.com, geonode.com +- sslproxies.org, us-proxy.org, socks-proxy.net +- proxy-list.download, proxyscan.io, proxyspace.pro +- freeproxy.lunaproxy.com, more + +**šŸ“¦ GitHub Repositories (13 sources)** +- proxifly/free-proxy-list, monosans/proxy-list, TheSpeedX/PROXY-List +- jetkai/proxy-list, roosterkid/openproxylist, mmpx12/proxy-list +- ShiftyTR/Proxy-List, clarketm/proxy-list, sunny9577/proxy-scraper +- zloi-user/hideip.me, almroot/proxylist, aslisk/proxyhttps +- proxy4parsing/proxy-list, more + +## Advanced Usage + +### CLI Options + +**Scraping:** ```bash -proxy_checker -p http -t 20 -s https://google.com -l output.txt +proxy_scraper -p [-o output.txt] [-v] + +Options: + -p, --proxy Proxy type: http, https, socks, socks4, socks5 + -o, --output Output file (default: output.txt) + -v, --verbose Show detailed statistics + -l, --list Input proxy file (default: output.txt) + -h, --help Show this help message ``` -- With `-t` or `--timeout`, set the timeout in seconds after which the proxy is considered dead. (Default is **20**). -- With `-p` or `--proxy`, check HTTPS, HTTP, SOCKS4, or SOCKS5 proxies. (Default is **HTTP**). -- With `-l` or `--list`, specify the path to your proxy list file. (Default is **output.txt**). -- With `-s` or `--site`, check proxies against a specific website like google.com. (Default is **https://google.com**). -- With `-r` or `--random_agent`, use a random user agent per proxy. -- With `-v` or `--verbose`, increase output verbosity. -- With `-h` or `--help`, show the help message. 
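+If you'd rather drive the scraper from Python than from the CLI, a minimal sketch along these lines should work (illustration only, assuming you run it from the repository root with `requirements.txt` installed; `ProxyScrapeScraper` is one of the scraper classes defined in `proxyScraper.py`):
+
+```python
+import asyncio
+
+import httpx
+
+from proxyScraper import ProxyScrapeScraper
+
+
+async def main() -> None:
+    # Each scraper class exposes an async scrape(client) that returns a
+    # (proxies, stats) tuple after IP/port validation and CDN filtering.
+    async with httpx.AsyncClient(follow_redirects=True, timeout=30.0) as client:
+        proxies, stats = await ProxyScrapeScraper("http").scrape(client)
+        print(f"{stats['valid']} valid proxies scraped, e.g. {proxies[:3]}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+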
+**Checking:** +```bash +proxy_checker [-l input.txt] [-t timeout] [-s site] [-v] + +Options: + -l, --list Input proxy file (default: output.txt) + -p, --proxy Proxy type: http, https, socks, socks4, socks5 + -o, --output Output file (default: output.txt) + -t, --timeout Timeout in seconds (default: 20) + -s, --site Test site (default: https://google.com) + -r, --random_agent Use random user agents + -v, --verbose Show detailed progress + --max-threads Maximum concurrent threads (default: 10) +``` + +### From Source Code +```bash +# Clone repository +git clone https://github.com/iw4p/proxy-scraper +cd proxy-scraper + +# Install dependencies +pip install -r requirements.txt + +# Run scraper +python proxyScraper.py -p http -v + +# Check proxies +python proxyChecker.py -l output.txt -v +``` -### Running Directly from Source +## Quality & Performance -If you prefer running the scripts directly from the source code, you can use the following commands: +- āœ… **Automatic filtering** - Removes bad IPs (Cloudflare, CDNs, private ranges) +- šŸ“Š **Source statistics** - See which sources provide the best proxies +- ⚔ **Fast concurrent** - All sources scraped simultaneously -#### For Scraping: +## Example Output ```bash -python3 proxyScraper.py -p http +*** Source Statistics *** +-------------------------------------------------- +PlainTextScraper: 0 valid, 0 bad IPs, 0 invalid +GeneralTableScraper: 0 valid, 0 bad IPs, 0 invalid +ProxyScrapeScraper: 1666 valid, 334 bad IPs, 0 invalid +GitHubScraper: 0 valid, 0 bad IPs, 0 invalid +ProxyListApiScraper: 261 valid, 0 bad IPs, 0 invalid +GeneralDivScraper: 0 valid, 0 bad IPs, 0 invalid +SpysMeScraper: 400 valid, 0 bad IPs, 0 invalid + +Total filtered: 334 bad IPs (CDN/etc), 0 invalid format +Writing 37030 unique proxies to output.txt... +Scraping completed in 13.13 seconds +Found 37030 unique valid proxies ``` -#### For Checking: +## šŸŒ Proxy Geolocation & Analysis + +The project includes a powerful geolocation tool to analyze proxy origins and track sources: + +### Features +- **šŸ” IP Geolocation** - Get country, city, ISP, and organization info +- **ā˜ļø CDN Detection** - Automatically identifies Cloudflare and other CDNs +- **šŸ¢ Datacenter Detection** - Flags hosting providers and datacenters +- **šŸ“Š Source Tracking** - Maps proxies back to their original sources +- **šŸ’¾ JSON Export** - Save analysis results for further processing + +### Usage Examples + +**Analyze single IP:** +```bash +python proxyGeolocation.py -i 104.16.1.31 +``` + +**Analyze proxy file:** +```bash +python proxyGeolocation.py -f output.txt -l 50 +``` + +**Track proxy sources:** +```bash +python proxyGeolocation.py -f output.txt -s --limit 100 +``` + +**Export to JSON:** +```bash +python proxyGeolocation.py -f output.txt -o analysis.json +``` +### Sample Output ```bash -python3 proxyChecker.py -p http -t 20 -s https://google.com -l output.txt +šŸ” Proxy Geolocation Analysis Results +================================================== + +šŸ“Š Summary: +Total proxies analyzed: 50 +Proxies with geolocation data: 45 +Cloudflare proxies: 8 +Datacenter proxies: 12 + +šŸŒŽ Countries: + United States (US): 15 + Germany (DE): 8 + Singapore (SG): 6 + ... + +šŸ“‹ Detailed Results: +──────────────────────────────────────────────────────────────── +ā˜ļø 104.16.1.31:80 - San Francisco, United States | Cloudflare Inc. 
+šŸŒ 45.79.143.52:3128 - Tokyo, Japan | Linode LLC +šŸ¢ 159.203.61.169:3128 - New York, United States | DigitalOcean ``` ## Good to Know - Dead proxies will be removed, and only alive proxies will remain in the output file. -- This script is capable of scraping SOCKS proxies, but `proxyChecker` currently only checks HTTP(S) proxies. +- The proxy checker supports all proxy types: **HTTP, HTTPS, SOCKS4, and SOCKS5**. +- Use random user agents (`-r` flag) for better success rates when checking proxies. ## Star History diff --git a/dev_requirements.txt b/dev_requirements.txt index f09b308..2b8fd6b 100644 --- a/dev_requirements.txt +++ b/dev_requirements.txt @@ -1,8 +1,8 @@ -flake8==4.0.1 -flake8-black==0.3.1 -flake8-bugbear==22.4.25 -flake8-builtins==1.5.3 -flake8-commas==2.1.0 -flake8-isort==4.1.1 -flake8-polyfill==1.0.2 -pep8-naming==0.12.1 \ No newline at end of file +flake8>=4.0.1,<8.0.0 +flake8-black>=0.3.1,<1.0.0 +flake8-bugbear>=22.4.25,<25.0.0 +flake8-builtins>=1.5.3,<3.0.0 +flake8-commas>=2.1.0,<5.0.0 +flake8-isort>=4.1.1,<7.0.0 +flake8-polyfill>=1.0.2,<2.0.0 +pep8-naming>=0.12.1,<1.0.0 \ No newline at end of file diff --git a/proxyChecker.py b/proxyChecker.py index a5b8828..f0398b3 100644 --- a/proxyChecker.py +++ b/proxyChecker.py @@ -1,151 +1,582 @@ import argparse +import concurrent.futures +import logging import random import re import socket +import sys import threading import urllib.request +from pathlib import Path from time import time +from typing import List, Optional, Tuple import socks +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# Fallback user agents (will be extended from user_agents.txt if available) user_agents = [ - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/37.0.2062.94 Chrome/37.0.2062.94 Safari/537.36" - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko", - "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9", - "Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4", - "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:133.0) Gecko/20100101 Firefox/133.0", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0", ] -try: - with open("user_agents.txt", "r") as f: - for line in f: - user_agents.append(line.replace("\n", "")) -except FileNotFoundError: - pass +# Load additional user agents from file if available +def load_user_agents() -> None: + """Load user agents from external file 
if available.""" + try: + user_agents_file = Path("user_agents.txt") + if user_agents_file.exists(): + with open(user_agents_file, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line and line not in user_agents: + user_agents.append(line) + logger.debug(f"Loaded {len(user_agents)} user agents from file") + else: + logger.debug("user_agents.txt not found, using built-in user agents") + except Exception as e: + logger.warning(f"Failed to load user agents from file: {e}") + +# Load user agents at module level +load_user_agents() class Proxy: - def __init__(self, method, proxy): - if method.lower() not in ["http", "https", "socks4", "socks5"]: - raise NotImplementedError("Only HTTP, HTTPS, SOCKS4, and SOCKS5 are supported") - self.method = method.lower() - self.proxy = proxy - - def is_valid(self): - return re.match(r"\d{1,3}(?:\.\d{1,3}){3}(?::\d{1,5})?$", self.proxy) - - def check(self, site, timeout, user_agent, verbose): - if self.method in ["socks4", "socks5"]: - socks.set_default_proxy(socks.SOCKS4 if self.method == "socks4" else socks.SOCKS5, - self.proxy.split(':')[0], int(self.proxy.split(':')[1])) + """Represents a proxy server with validation and checking capabilities.""" + + SUPPORTED_METHODS = ["http", "https", "socks4", "socks5"] + + def __init__(self, method: str, proxy: str): + """ + Initialize a proxy instance. + + Args: + method: Proxy type (http, https, socks4, socks5) + proxy: Proxy address in format 'ip:port' + + Raises: + NotImplementedError: If proxy method is not supported + ValueError: If proxy format is invalid + """ + method = method.lower().strip() + if method not in self.SUPPORTED_METHODS: + raise NotImplementedError(f"Only {', '.join(self.SUPPORTED_METHODS)} are supported, got: {method}") + + self.method = method + self.proxy = proxy.strip() + + # Validate proxy format during initialization + if not self.is_valid(): + raise ValueError(f"Invalid proxy format: {proxy}") + + def is_valid(self) -> bool: + """ + Validate proxy format (IP:port). + + Returns: + True if proxy format is valid, False otherwise + """ + if not self.proxy or ':' not in self.proxy: + return False + + try: + ip, port = self.proxy.split(':', 1) + + # Validate IP format + if not re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip): + return False + + # Validate IP range (0-255 for each octet) + ip_parts = [int(x) for x in ip.split('.')] + if not all(0 <= part <= 255 for part in ip_parts): + return False + + # Validate port range + port_num = int(port) + if not (1 <= port_num <= 65535): + return False + + return True + except (ValueError, AttributeError): + return False + + def check(self, site: str, timeout: int, user_agent: str, verbose: bool) -> Tuple[bool, float, Optional[Exception]]: + """ + Check if proxy is working by attempting to connect through it. 
+ + Args: + site: Target website to test connection + timeout: Connection timeout in seconds + user_agent: User agent string to use + verbose: Enable verbose logging + + Returns: + Tuple of (is_valid, response_time, error) + """ + if not site.startswith(('http://', 'https://')): + site = f"https://{site}" + + start_time = time() + + try: + if self.method in ["socks4", "socks5"]: + return self._check_socks_proxy(site, timeout, verbose, start_time) + else: + return self._check_http_proxy(site, timeout, user_agent, verbose, start_time) + except Exception as e: + verbose_print(verbose, f"Proxy {self.proxy} failed with unexpected error: {e}") + return False, 0.0, e + + def _check_socks_proxy(self, site: str, timeout: int, verbose: bool, start_time: float) -> Tuple[bool, float, Optional[Exception]]: + """Check SOCKS proxy connectivity.""" + # Store original socket to restore later + original_socket = socket.socket + + try: + ip, port = self.proxy.split(':') + socks_type = socks.SOCKS4 if self.method == "socks4" else socks.SOCKS5 + + socks.set_default_proxy(socks_type, ip, int(port)) socket.socket = socks.socksocket + try: - start_time = time() - urllib.request.urlopen(site, timeout=timeout) - end_time = time() - time_taken = end_time - start_time - verbose_print(verbose, f"Proxy {self.proxy} is valid, time taken: {time_taken}") - return True, time_taken, None - except Exception as e: - verbose_print(verbose, f"Proxy {self.proxy} is not valid, error: {str(e)}") - return False, 0, e - else: - url = self.method + "://" + self.proxy - proxy_support = urllib.request.ProxyHandler({self.method: url}) - opener = urllib.request.build_opener(proxy_support) - urllib.request.install_opener(opener) - req = urllib.request.Request(self.method + "://" + site) - req.add_header("User-Agent", user_agent) - try: - start_time = time() - urllib.request.urlopen(req, timeout=timeout) + response = urllib.request.urlopen(site, timeout=timeout) + response.read(1024) # Read a small amount to ensure connection works end_time = time() time_taken = end_time - start_time - verbose_print(verbose, f"Proxy {self.proxy} is valid, time taken: {time_taken}") + + verbose_print(verbose, f"[+] Proxy {self.proxy} ({self.method.upper()}) is valid, time: {time_taken:.2f}s") return True, time_taken, None - except Exception as e: - verbose_print(verbose, f"Proxy {self.proxy} is not valid, error: {str(e)}") - return False, 0, e + + finally: + # Always restore original socket + socket.socket = original_socket + + except Exception as e: + socket.socket = original_socket # Ensure cleanup even on error + verbose_print(verbose, f"[-] Proxy {self.proxy} ({self.method.upper()}) failed: {e}") + return False, 0.0, e - def __str__(self): + def _check_http_proxy(self, site: str, timeout: int, user_agent: str, verbose: bool, start_time: float) -> Tuple[bool, float, Optional[Exception]]: + """Check HTTP/HTTPS proxy connectivity.""" + try: + proxy_url = f"{self.method}://{self.proxy}" + proxy_handler = urllib.request.ProxyHandler({ + 'http': proxy_url, + 'https': proxy_url, + }) + + opener = urllib.request.build_opener(proxy_handler) + + # Create request with proper headers + request = urllib.request.Request(site) + request.add_header("User-Agent", user_agent) + request.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + request.add_header("Accept-Language", "en-US,en;q=0.5") + request.add_header("Accept-Encoding", "gzip, deflate") + request.add_header("Connection", "keep-alive") + + response = opener.open(request, 
timeout=timeout) + response.read(1024) # Read a small amount to ensure connection works + + end_time = time() + time_taken = end_time - start_time + + verbose_print(verbose, f"[+] Proxy {self.proxy} ({self.method.upper()}) is valid, time: {time_taken:.2f}s") + return True, time_taken, None + + except Exception as e: + verbose_print(verbose, f"[-] Proxy {self.proxy} ({self.method.upper()}) failed: {e}") + return False, 0.0, e + + def __str__(self) -> str: + """String representation of the proxy.""" return self.proxy + def __repr__(self) -> str: + """Detailed string representation.""" + return f"Proxy(method='{self.method}', proxy='{self.proxy}')" + -def verbose_print(verbose, message): +def verbose_print(verbose: bool, message: str) -> None: + """Print message if verbose mode is enabled.""" if verbose: print(message) -def check(file, timeout, method, site, verbose, random_user_agent): +def _process_proxy_line(line: str, line_num: int, method: str) -> Optional[Proxy]: + """Process a single line from proxy file.""" + line = line.strip() + if not line or line.startswith('#'): # Skip empty lines and comments + return None + + try: + return Proxy(method, line) + except (ValueError, NotImplementedError) as e: + logger.debug(f"Line {line_num}: Invalid proxy '{line}' - {e}") + return None + + +def _read_proxy_file(file_path: str) -> List[str]: + """Read and return lines from proxy file.""" + try: + with open(file_path, "r", encoding="utf-8") as f: + return list(f) + except FileNotFoundError: + logger.error(f"Proxy file not found: {file_path}") + sys.exit(1) + except Exception as e: + logger.error(f"Error reading proxy file {file_path}: {e}") + sys.exit(1) + + +def load_proxies_from_file(file_path: str, method: str, limit: Optional[int] = None) -> List[Proxy]: + """ + Load proxies from file and create Proxy objects. + + Args: + file_path: Path to proxy list file + method: Proxy method to use + limit: Maximum number of proxies to load (None for all) + + Returns: + List of valid Proxy objects + """ proxies = [] - with open(file, "r") as f: - for line in f: - proxies.append(Proxy(method, line.replace("\n", ""))) + invalid_count = 0 + + lines = _read_proxy_file(file_path) + + for line_num, line in enumerate(lines, 1): + # Check if we've reached the limit + if limit is not None and len(proxies) >= limit: + logger.info(f"Reached limit of {limit} proxies, stopping load") + break + + proxy = _process_proxy_line(line, line_num, method) + if proxy is not None: + proxies.append(proxy) + else: + if line.strip() and not line.strip().startswith('#'): + invalid_count += 1 + + if invalid_count > 0: + logger.warning(f"Skipped {invalid_count} invalid proxy entries") + + return proxies +def save_valid_proxies(file_path: str, valid_proxies: List[Proxy]) -> None: + """ + Save valid proxies back to file. 
+ + Args: + file_path: Output file path + valid_proxies: List of valid proxies to save + """ + try: + # Sort proxies for consistent output + sorted_proxies = sorted(valid_proxies, key=lambda p: p.proxy) + + with open(file_path, "w", encoding="utf-8") as f: + for proxy in sorted_proxies: + f.write(f"{proxy}\n") + + logger.info(f"Saved {len(valid_proxies)} valid proxies to {file_path}") + + except Exception as e: + logger.error(f"Failed to save proxies to {file_path}: {e}") + raise - print(f"Checking {len(proxies)} proxies") - proxies = filter(lambda x: x.is_valid(), proxies) - valid_proxies = [] - user_agent = random.choice(user_agents) - def check_proxy(proxy, user_agent): - new_user_agent = user_agent - if random_user_agent: - new_user_agent = random.choice(user_agents) - valid, time_taken, error = proxy.check(site, timeout, new_user_agent, verbose) - valid_proxies.extend([proxy] if valid else []) +def _prepare_checking_environment(file: str, method: str, site: str, timeout: int, random_user_agent: bool, limit: Optional[int] = None) -> Tuple[List[Proxy], str, int]: + """Prepare the environment for proxy checking.""" + print(f"Loading proxies from {file}...") + proxies = load_proxies_from_file(file, method, limit) + print(f"Loaded {len(proxies)} valid proxies for checking") + + if not proxies: + print("No valid proxies found to check") + return [], "", 0 + + # Choose base user agent + base_user_agent = random.choice(user_agents) + + # Print checking parameters + max_threads = min(len(proxies), 100) + print(f"Starting proxy validation with {max_threads} concurrent threads...") + print(f"Target site: {site}") + print(f"Timeout: {timeout}s") + print(f"Method: {method.upper()}") + print(f"User agent strategy: {'Random per proxy' if random_user_agent else 'Fixed'}") + print("-" * 60) + + return proxies, base_user_agent, max_threads - threads = [] - for proxy in proxies: - t = threading.Thread(target=check_proxy, args=(proxy, user_agent)) - threads.append(t) - for t in threads: - t.start() +def _create_proxy_checker(valid_proxies: List[Proxy], checked_count_ref: List[int], lock: threading.Lock, + site: str, timeout: int, random_user_agent: bool, base_user_agent: str, + total_proxies: int, verbose: bool): + """Create a proxy checking function with proper closure.""" + def check_single_proxy(proxy: Proxy) -> None: + """Check a single proxy and update results.""" + try: + # Select user agent + current_user_agent = random.choice(user_agents) if random_user_agent else base_user_agent + + # Check proxy + is_valid, response_time, error = proxy.check(site, timeout, current_user_agent, verbose) + + # Update results thread-safely + with lock: + checked_count_ref[0] += 1 + + if is_valid: + valid_proxies.append(proxy) + + # Progress indicator + if not verbose and checked_count_ref[0] % 50 == 0: + print(f"Progress: {checked_count_ref[0]}/{total_proxies} ({len(valid_proxies)} valid)") + + except Exception as e: + logger.debug(f"Unexpected error checking proxy {proxy}: {e}") + + return check_single_proxy - for t in threads: - t.join() - with open(file, "w") as f: - for proxy in valid_proxies: - f.write(str(proxy) + "\n") +def check(file: str, timeout: int, method: str, site: str, verbose: bool, random_user_agent: bool, limit: Optional[int] = None) -> None: + """ + Main proxy checking function. 
+ + Args: + file: Path to proxy list file + timeout: Connection timeout in seconds + method: Proxy method to check + site: Target website for testing + verbose: Enable verbose output + random_user_agent: Use random user agent per proxy + limit: Maximum number of proxies to check + """ + start_time = time() + + # Prepare checking environment + proxies, base_user_agent, max_threads = _prepare_checking_environment( + file, method, site, timeout, random_user_agent, limit, + ) + + if not proxies: + return + + # Initialize checking state + valid_proxies = [] + checked_count_ref = [0] # Use list for mutable reference + lock = threading.Lock() + + # Create checker function + check_single_proxy = _create_proxy_checker( + valid_proxies, checked_count_ref, lock, site, timeout, + random_user_agent, base_user_agent, len(proxies), verbose, + ) + + _run_proxy_check_threadpool( + check_single_proxy, proxies, valid_proxies, checked_count_ref, file, start_time, + ) + elapsed_time = time() - start_time + # Final statistics + success_rate = (len(valid_proxies) / len(proxies)) * 100 if proxies else 0 + print("-" * 60) + print("Proxy checking completed!") + print(f"Total checked: {len(proxies)}") + print(f"Valid proxies: {len(valid_proxies)}") + print(f"Success rate: {success_rate:.1f}%") + print(f"Time taken: {elapsed_time:.2f} seconds") + print(f"Average time per proxy: {elapsed_time/len(proxies):.2f}s") + if len(valid_proxies) == 0: + print("WARNING: No working proxies found. Consider:") + print(" - Increasing timeout value") + print(" - Trying a different target site") + print(" - Using fresh proxy list") + - print(f"Found {len(valid_proxies)} valid proxies") +def _run_proxy_check_threadpool(check_single_proxy, proxies, valid_proxies, checked_count_ref, file, start_time): + """Helper to run proxy checking in a thread pool, handles KeyboardInterrupt and saving.""" + executor = None + try: + executor = concurrent.futures.ThreadPoolExecutor(max_workers=min(len(proxies), 100)) + futures = [executor.submit(check_single_proxy, proxy) for proxy in proxies] + for _ in concurrent.futures.as_completed(futures): + pass + except KeyboardInterrupt: + print("\n[!] Proxy checking cancelled by user. Stopping threads and saving progress...") + if executor is not None: + try: + executor.shutdown(wait=False, cancel_futures=True) + except Exception: + pass + save_valid_proxies(file, valid_proxies) + elapsed_time = time() - start_time + print("-" * 60) + print(f"Check cancelled. 
{len(valid_proxies)} valid proxies saved to {file}.") + print(f"Checked: {checked_count_ref[0]} / {len(proxies)} | Time: {elapsed_time:.2f}s") + sys.exit(130) + if executor is not None: + executor.shutdown(wait=True) + save_valid_proxies(file, valid_proxies) -def main(): - parser = argparse.ArgumentParser() +def _setup_argument_parser() -> argparse.ArgumentParser: + """Set up and configure the argument parser.""" + parser = argparse.ArgumentParser( + description="Check proxy servers for connectivity and validity", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s -p http -t 10 -v # Check HTTP proxies with 10s timeout + %(prog)s -p socks4 -l socks.txt -r # Check SOCKS4 with random user agents + %(prog)s -p https -s httpbin.org/ip --debug # Check HTTPS proxies against custom site + %(prog)s -p http --limit 50 -v # Check only the first 50 HTTP proxies + %(prog)s -p socks5 -l proxies.txt -t 30 --max-threads 20 # Check SOCKS5 proxies with 30s timeout and 20 threads +Notes: + - Dead proxies are automatically removed from the list file + - Use --debug for detailed error information + - Higher timeout values may find more working proxies but take longer + - Use --limit for quick testing or when you don't want to check all proxies + - Random user agents can help avoid detection by target sites + - Use --max-threads to control concurrency, default is 10 + """, + ) + parser.add_argument( - "-t", - "--timeout", + "-t", "--timeout", type=int, - help="Dismiss the proxy after -t seconds", default=20, + help="Connection timeout in seconds (default: %(default)s)", + ) + parser.add_argument( + "-p", "--proxy", + choices=Proxy.SUPPORTED_METHODS, + default="http", + help="Proxy type to check (default: %(default)s)", + ) + parser.add_argument( + "-l", "--list", + default="output.txt", + help="Path to proxy list file (default: %(default)s)", + ) + parser.add_argument( + "-s", "--site", + default="https://httpbin.org/ip", + help="Target website for testing (default: %(default)s)", ) - parser.add_argument("-p", "--proxy", help="Check HTTPS, HTTP, SOCKS4, or SOCKS5 proxies", default="http") - parser.add_argument("-l", "--list", help="Path to your proxy list file", default="output.txt") parser.add_argument( - "-s", - "--site", - help="Check with specific website like google.com", - default="https://google.com/", + "-v", "--verbose", + action="store_true", + help="Enable verbose output showing each proxy check", ) parser.add_argument( - "-v", - "--verbose", - help="Increase output verbosity", + "-r", "--random_agent", action="store_true", + help="Use a different random user agent for each proxy", ) parser.add_argument( - "-r", - "--random_agent", - help="Use a random user agent per proxy", + "--debug", action="store_true", + help="Enable debug logging for troubleshooting", + ) + parser.add_argument( + "--max-threads", + type=int, + default=10, + help="Maximum number of concurrent threads (default: %(default)s)", ) + parser.add_argument( + "--limit", + type=int, + help="Maximum number of proxies to check (default: check all)", + ) + + return parser + + +def _configure_logging_and_validate_args(args) -> str: + """Configure logging and validate arguments.""" + # Configure logging + if args.debug: + logging.getLogger().setLevel(logging.DEBUG) + elif args.verbose: + logging.getLogger().setLevel(logging.INFO) + else: + logging.getLogger().setLevel(logging.WARNING) + + # Validate arguments + if args.timeout <= 0: + print("Error: Timeout must be positive") + sys.exit(1) + + if 
args.max_threads <= 0: + print("Error: max-threads must be positive") + sys.exit(1) + + # Check if proxy file exists + if not Path(args.list).exists(): + print(f"Error: Proxy file '{args.list}' not found") + print("Tip: Run the proxy scraper first to generate a proxy list") + sys.exit(1) + + # Normalize site URL + site = args.site + if not site.startswith(('http://', 'https://')): + site = f"https://{site}" + + return site + + +def main() -> None: + """Main entry point for the proxy checker.""" + parser = _setup_argument_parser() args = parser.parse_args() - check(file=args.list, timeout=args.timeout, method=args.proxy, site=args.site, verbose=args.verbose, - random_user_agent=args.random_agent) + + # Configure logging and validate arguments + site = _configure_logging_and_validate_args(args) + + # Display startup information + print("*** Proxy Checker v2.0 ***") + print(f"Proxy file: {args.list}") + print(f"Target site: {site}") + print(f"Timeout: {args.timeout}s") + print(f"Method: {args.proxy.upper()}") + print(f"Max threads: {args.max_threads}") + if args.limit: + print(f"Limit: {args.limit} proxies") + print(f"User agents: {len(user_agents)} available") + print("=" * 60) + + try: + check( + file=args.list, + timeout=args.timeout, + method=args.proxy, + site=site, + verbose=args.verbose, + random_user_agent=args.random_agent, + limit=args.limit, + ) + + except KeyboardInterrupt: + print("\nWARNING: Operation interrupted by user") + sys.exit(1) + except Exception as e: + logger.error(f"Proxy checking failed: {e}") + if args.debug: + import traceback + traceback.print_exc() + sys.exit(1) if __name__ == "__main__": diff --git a/proxyGeolocation.py b/proxyGeolocation.py new file mode 100644 index 0000000..152d976 --- /dev/null +++ b/proxyGeolocation.py @@ -0,0 +1,504 @@ +#!/usr/bin/env python3 +""" +Proxy Geolocation and Source Tracking Tool +Identifies proxy origins and tracks which sources provide which proxies. +""" + +import argparse +import asyncio +import json +import logging +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import httpx + +# Setup logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +@dataclass +class ProxyInfo: + """Information about a proxy including its geolocation and source.""" + ip: str + port: str + country: Optional[str] = None + country_code: Optional[str] = None + city: Optional[str] = None + region: Optional[str] = None + org: Optional[str] = None + isp: Optional[str] = None + source: Optional[str] = None + is_cloudflare: bool = False + is_datacenter: bool = False + +class ProxyGeolocator: + """Main class for proxy geolocation and source tracking.""" + + def __init__(self): + self.session: Optional[httpx.AsyncClient] = None + + async def __aenter__(self): + """Async context manager entry.""" + self.session = httpx.AsyncClient( + timeout=httpx.Timeout(30.0), + limits=httpx.Limits(max_connections=10, max_keepalive_connections=5), + ) + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Async context manager exit.""" + if self.session: + await self.session.aclose() + + def _check_special_addresses(self, ip: str, proxy_info: ProxyInfo) -> bool: + """Check for special/reserved addresses. 
Returns True if special address found.""" + try: + import ipaddress + ip_obj = ipaddress.ip_address(ip) + + if str(ip_obj) == "0.0.0.0": + proxy_info.org = "Reserved: 'This host' address" + proxy_info.country = "Invalid" + return True + elif ip_obj.is_private: + proxy_info.org = "Private network address" + proxy_info.country = "Local" + return True + elif ip_obj.is_loopback: + proxy_info.org = "Loopback address" + proxy_info.country = "Local" + return True + elif ip_obj.is_reserved: + proxy_info.org = "Reserved address" + proxy_info.country = "Invalid" + return True + + return False + except Exception: + return False + + def _process_geolocation_data(self, data: dict, proxy_info: ProxyInfo) -> None: + """Process geolocation API response data.""" + if data.get("status") != "success": + return + + proxy_info.country = data.get("country") + proxy_info.country_code = data.get("countryCode") + proxy_info.city = data.get("city") + proxy_info.region = data.get("region") + proxy_info.org = data.get("org") + proxy_info.isp = data.get("isp") + + # Check if it's Cloudflare + org_lower = (data.get("org") or "").lower() + isp_lower = (data.get("isp") or "").lower() + if "cloudflare" in org_lower or "cloudflare" in isp_lower: + proxy_info.is_cloudflare = True + + # Check if it's a datacenter + datacenter_keywords = ["datacenter", "hosting", "server", "cloud", "digital ocean", "aws", "amazon", "google", "microsoft"] + if any(keyword in org_lower or keyword in isp_lower for keyword in datacenter_keywords): + proxy_info.is_datacenter = True + + async def get_ip_info(self, ip: str) -> ProxyInfo: + """Get geolocation information for an IP address.""" + proxy_info = ProxyInfo(ip=ip, port="") + + # Check for special/reserved addresses first + if self._check_special_addresses(ip, proxy_info): + return proxy_info + + try: + # Use ip-api.com for geolocation (free, no API key needed) + url = f"http://ip-api.com/json/{ip}?fields=status,message,country,countryCode,region,city,org,isp,as" + + if not self.session: + raise RuntimeError("Session not initialized") + + response = await self.session.get(url) + response.raise_for_status() + + data = response.json() + self._process_geolocation_data(data, proxy_info) + + except Exception as e: + logger.debug(f"Error getting IP info for {ip}: {e}") + + return proxy_info + + def _parse_proxy_line(self, line: str, line_num: int) -> Optional[Tuple[str, int]]: + """Parse a single proxy line. 
Returns None if invalid.""" + line = line.strip() + if not line or line.startswith('#'): + return None + + if ':' not in line: + return None + + try: + ip, port = line.split(':', 1) + ip = ip.strip() + port = int(port.strip()) + return (ip, port) + except ValueError: + logger.warning(f"Invalid proxy format on line {line_num}: {line}") + return None + + def _read_proxy_file_lines(self, file_path: str) -> List[str]: + """Read all lines from proxy file.""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + return list(f) + except FileNotFoundError: + logger.error(f"Proxy file not found: {file_path}") + return [] + except Exception as e: + logger.error(f"Error reading proxy file: {e}") + return [] + + def parse_proxy_list(self, file_path: str) -> List[Tuple[str, int]]: + """Parse proxy list file and return list of (ip, port) tuples.""" + proxies = [] + lines = self._read_proxy_file_lines(file_path) + + for line_num, line in enumerate(lines, 1): + proxy = self._parse_proxy_line(line, line_num) + if proxy is not None: + proxies.append(proxy) + + return proxies + + async def analyze_proxies(self, proxy_list: List[Tuple[str, int]], limit: Optional[int] = None) -> List[ProxyInfo]: + """Analyze a list of proxies and get their geolocation info.""" + if limit: + proxy_list = proxy_list[:limit] + + logger.info(f"šŸŒ Analyzing {len(proxy_list)} proxies for geolocation...") + + results = [] + for i, (ip, port) in enumerate(proxy_list, 1): + logger.info(f"šŸ“ Analyzing {i}/{len(proxy_list)}: {ip}:{port}") + + proxy_info = await self.get_ip_info(ip) + proxy_info.port = str(port) + results.append(proxy_info) + + # Small delay to be respectful to the API + await asyncio.sleep(0.1) + + return results + + def _calculate_summary_stats(self, results: List[ProxyInfo]) -> Tuple[Dict[str, int], int, int, int]: + """Calculate summary statistics from proxy results.""" + countries = {} + cloudflare_count = 0 + datacenter_count = 0 + valid_info_count = 0 + + for proxy in results: + if proxy.country: + valid_info_count += 1 + country_key = f"{proxy.country} ({proxy.country_code})" if proxy.country_code else proxy.country + countries[country_key] = countries.get(country_key, 0) + 1 + + if proxy.is_cloudflare: + cloudflare_count += 1 + if proxy.is_datacenter: + datacenter_count += 1 + + return countries, cloudflare_count, datacenter_count, valid_info_count + + def _print_summary_stats(self, results: List[ProxyInfo], countries: Dict[str, int], + cloudflare_count: int, datacenter_count: int, valid_info_count: int): + """Print summary statistics.""" + print("\nšŸ“Š Summary:") + print(f"Total proxies analyzed: {len(results)}") + print(f"Proxies with geolocation data: {valid_info_count}") + print(f"Cloudflare proxies: {cloudflare_count}") + print(f"Datacenter proxies: {datacenter_count}") + + if countries: + print("\nšŸŒŽ Countries:") + for country, count in sorted(countries.items(), key=lambda x: x[1], reverse=True): + print(f" {country}: {count}") + + def _format_proxy_details(self, proxy: ProxyInfo) -> str: + """Format proxy details for display.""" + flag = "šŸ”" + if proxy.is_cloudflare: + flag = "ā˜ļø" + elif proxy.is_datacenter: + flag = "šŸ¢" + elif proxy.country: + flag = "šŸŒ" + + location = "Unknown" + if proxy.city and proxy.country: + location = f"{proxy.city}, {proxy.country}" + elif proxy.country: + location = proxy.country + + org_info = "" + if proxy.org: + org_info = f" | {proxy.org}" + if proxy.isp and proxy.isp != proxy.org: + org_info += f" | ISP: {proxy.isp}" + + return f"{flag} 
{proxy.ip}:{proxy.port} - {location}{org_info}" + + def print_analysis_results(self, results: List[ProxyInfo], show_details: bool = True): + """Print analysis results in a formatted way.""" + if not results: + print("āŒ No proxy data to analyze") + return + + print("\nšŸ” Proxy Geolocation Analysis Results") + print("=" * 50) + + # Calculate summary statistics + countries, cloudflare_count, datacenter_count, valid_info_count = self._calculate_summary_stats(results) + + # Print summary + self._print_summary_stats(results, countries, cloudflare_count, datacenter_count, valid_info_count) + + if show_details: + print("\nšŸ“‹ Detailed Results:") + print("-" * 80) + + for proxy in results: + print(self._format_proxy_details(proxy)) + + def save_results_json(self, results: List[ProxyInfo], output_file: str): + """Save results to JSON file.""" + data = [] + for proxy in results: + data.append({ + "ip": proxy.ip, + "port": proxy.port, + "country": proxy.country, + "country_code": proxy.country_code, + "city": proxy.city, + "region": proxy.region, + "org": proxy.org, + "isp": proxy.isp, + "is_cloudflare": proxy.is_cloudflare, + "is_datacenter": proxy.is_datacenter, + "source": proxy.source, + }) + + try: + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=2, ensure_ascii=False) + print(f"šŸ’¾ Results saved to: {output_file}") + except Exception as e: + logger.error(f"Error saving results: {e}") + + async def analyze_proxy_sources(self, proxy_file: str, limit: Optional[int] = None) -> Dict[str, List[str]]: + """Analyze which source each proxy likely came from by checking current scraper results.""" + # Dynamic import to avoid circular dependency + try: + import proxyScraper + scrapers = proxyScraper.scrapers + except ImportError: + logger.warning("Could not import proxyScraper - source analysis unavailable") + return {} + + # Load proxies from file + proxies = self.parse_proxy_list(proxy_file) + if limit: + proxies = proxies[:limit] + + proxy_set = {f"{ip}:{port}" for ip, port in proxies} + source_map = {} + + logger.info(f"šŸ” Analyzing sources for {len(proxy_set)} proxies...") + + # Check each scraper + client_config = { + "follow_redirects": True, + "timeout": 30.0, + "limits": httpx.Limits(max_keepalive_connections=20, max_connections=100), + "headers": { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + }, + } + + async with httpx.AsyncClient(**client_config) as client: + for scraper in scrapers: + try: + logger.info(f"ļæ½ Checking {scraper.source_name}...") + scraped_proxies, _ = await scraper.scrape(client) + scraped_set = set(scraped_proxies) + + # Find matches + matches = proxy_set.intersection(scraped_set) + if matches: + source_map[scraper.source_name] = list(matches) + logger.info(f" Found {len(matches)} matches") + + await asyncio.sleep(0.5) # Be respectful to sources + + except Exception as e: + logger.debug(f"Error checking {scraper.source_name}: {e}") + + return source_map + + async def check_single_ip(self, ip: str) -> ProxyInfo: + """Check a single IP address.""" + logger.info(f"šŸ” Checking IP: {ip}") + return await self.get_ip_info(ip) + +def _setup_argument_parser(): + """Set up command line argument parser.""" + parser = argparse.ArgumentParser( + description="Proxy Geolocation and Source Tracking Tool", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python proxyGeolocation.py -i 104.16.1.31 + python proxyGeolocation.py -f 
output.txt -l 20 + python proxyGeolocation.py -f output.txt -s --limit 50 + python proxyGeolocation.py -f output.txt -o results.json + python proxyGeolocation.py -f output.txt --no-details + """, + ) + + parser.add_argument("-i", "--ip", type=str, help="Check single IP address") + parser.add_argument("-f", "--file", type=str, help="Path to proxy list file (default: output.txt)") + parser.add_argument("-s", "--sources", action="store_true", help="Analyze which sources provide which proxies") + parser.add_argument("-l", "--limit", type=int, help="Limit number of proxies to analyze") + parser.add_argument("-o", "--output", type=str, help="Save results to JSON file") + parser.add_argument("--no-details", action="store_true", help="Show only summary, no detailed results") + parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose logging") + + return parser + +async def _handle_single_ip(geolocator, args): + """Handle single IP analysis.""" + result = await geolocator.check_single_ip(args.ip) + geolocator.print_analysis_results([result], show_details=True) + + if args.output: + geolocator.save_results_json([result], args.output) + +def _validate_proxy_file(proxy_file: str) -> bool: + """Validate that proxy file exists.""" + if not Path(proxy_file).exists(): + print(f"āŒ Proxy file not found: {proxy_file}") + print("šŸ’” Run proxy scraper first: python proxyScraper.py -p http") + return False + return True + +def _print_source_summary(source_map: dict, total_mapped: int) -> None: + """Print source analysis summary.""" + print("\nšŸ” Proxy Source Analysis Results") + print("=" * 50) + print(f"Total proxies mapped to sources: {total_mapped}") + +def _print_source_details(source_map: dict, show_details: bool) -> None: + """Print detailed source information.""" + if not source_map: + return + + print("\nšŸ“Š Sources:") + for source, proxy_list in sorted(source_map.items(), key=lambda x: len(x[1]), reverse=True): + print(f" {source}: {len(proxy_list)} proxies") + if not show_details: + continue + + # Show first few proxies as examples + for proxy in proxy_list[:5]: + print(f" - {proxy}") + if len(proxy_list) > 5: + print(f" ... 
and {len(proxy_list) - 5} more") + print() + +async def _handle_source_analysis(geolocator, args): + """Handle proxy source analysis.""" + proxy_file = args.file or "output.txt" + + if not _validate_proxy_file(proxy_file): + return + + source_map = await geolocator.analyze_proxy_sources(proxy_file, args.limit) + total_mapped = sum(len(proxies) for proxies in source_map.values()) + + _print_source_summary(source_map, total_mapped) + _print_source_details(source_map, not args.no_details) + + if args.output: + output_data = { + "analysis_type": "source_mapping", + "total_mapped": total_mapped, + "sources": source_map, + } + try: + with open(args.output, 'w', encoding='utf-8') as f: + json.dump(output_data, f, indent=2) + print(f"šŸ’¾ Source analysis saved to: {args.output}") + except Exception as e: + logger.error(f"Error saving results: {e}") + +async def _handle_file_analysis(geolocator, args): + """Handle proxy file analysis.""" + proxy_file = args.file or "output.txt" + + if not Path(proxy_file).exists(): + print(f"āŒ Proxy file not found: {proxy_file}") + print("šŸ’” Run proxy scraper first: python proxyScraper.py -p http") + return + + proxies = geolocator.parse_proxy_list(proxy_file) + + if not proxies: + print(f"āŒ No valid proxies found in {proxy_file}") + return + + results = await geolocator.analyze_proxies(proxies, args.limit) + geolocator.print_analysis_results(results, show_details=not args.no_details) + + if args.output: + geolocator.save_results_json(results, args.output) + +def _configure_environment(args) -> None: + """Configure logging and environment settings.""" + if args.verbose: + logging.getLogger().setLevel(logging.DEBUG) + + # Handle Windows event loop + if sys.platform.startswith('win'): + asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy()) + +async def _run_analysis_based_on_args(geolocator, args): + """Run analysis based on command line arguments.""" + if args.ip: + await _handle_single_ip(geolocator, args) + elif args.sources: + await _handle_source_analysis(geolocator, args) + else: + await _handle_file_analysis(geolocator, args) + +def main(): + """Main function for CLI usage.""" + parser = _setup_argument_parser() + args = parser.parse_args() + + _configure_environment(args) + + async def run_analysis(): + async with ProxyGeolocator() as geolocator: + await _run_analysis_based_on_args(geolocator, args) + + # Run the analysis + try: + asyncio.run(run_analysis()) + except KeyboardInterrupt: + print("\nā¹ļø Analysis interrupted by user") + except Exception as e: + logger.error(f"Analysis failed: {e}") + +if __name__ == "__main__": + main() diff --git a/proxyScraper.py b/proxyScraper.py index ec00038..24a2062 100644 --- a/proxyScraper.py +++ b/proxyScraper.py @@ -1,242 +1,755 @@ import argparse import asyncio +import ipaddress +import logging import platform import re import sys import time +from typing import Dict, List, Optional, Set, Tuple +from urllib.parse import urlparse import httpx from bs4 import BeautifulSoup +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# --- Module-level helpers for source statistics --- +def _extract_domain(url): + """Extract domain from URL for statistics.""" + try: + domain = urlparse(url).netloc or urlparse('//' + url).netloc + if not domain: + domain = url + except Exception: + domain = url + return domain + +def _aggregate_domain_stats(source_stats): + """Aggregate statistics by domain.""" + 
total_bad_filtered = 0 + total_invalid_filtered = 0 + domain_valid = {} + skipped = 0 + for source, stats in source_stats.items(): + url = source.split(": ", 1)[-1] + domain = _extract_domain(url) + if stats['valid'] > 0: + domain_valid[domain] = domain_valid.get(domain, 0) + stats['valid'] + else: + skipped += 1 + total_bad_filtered += stats['filtered_bad'] + total_invalid_filtered += stats['filtered_invalid'] + return domain_valid, skipped, total_bad_filtered, total_invalid_filtered + +def _print_summary(domain_valid, skipped, total_bad_filtered, total_invalid_filtered): + """Print formatted statistics summary.""" + print("\n*** Source Statistics ***") + print("-" * 50) + for domain, valid_count in sorted(domain_valid.items(), key=lambda x: -x[1]): + print(f"{valid_count} valid from {domain}") + if skipped: + print(f"...{skipped} sources returned 0 valid proxies and are hidden...") + print(f"\nTotal filtered: {total_bad_filtered} bad IPs (CDN/etc), {total_invalid_filtered} invalid format") + +# Known bad IP ranges to filter out (Cloudflare, major CDNs, etc.) +BAD_IP_RANGES = [ + # Cloudflare + "173.245.48.0/20", + "103.21.244.0/22", + "103.22.200.0/22", + "103.31.4.0/22", + "141.101.64.0/18", + "108.162.192.0/18", + "190.93.240.0/20", + "188.114.96.0/20", + "197.234.240.0/22", + "198.41.128.0/17", + "162.158.0.0/15", + "104.16.0.0/13", # This includes our problematic IP 104.16.1.31 + "104.24.0.0/14", + "172.64.0.0/13", + "131.0.72.0/22", + # Amazon CloudFront + "13.32.0.0/15", + "13.35.0.0/17", + "18.160.0.0/15", + "52.222.128.0/17", + "54.182.0.0/16", + "54.192.0.0/16", + "54.230.0.0/16", + "54.239.128.0/18", + "99.86.0.0/16", + "205.251.200.0/21", + "216.137.32.0/19", +] + +def is_bad_ip(ip: str) -> bool: + """Check if an IP is in a known bad range (CDN, etc.) 
or is a reserved address.""" + try: + ip_obj = ipaddress.ip_address(ip) + + # Check for reserved/special addresses + if ip_obj.is_private or ip_obj.is_loopback or ip_obj.is_reserved or ip_obj.is_multicast: + return True + + # Check for specific bad addresses + if str(ip_obj) in ["0.0.0.0", "255.255.255.255", "127.0.0.1"]: + return True + + # Check against known bad ranges (CDNs) + for cidr in BAD_IP_RANGES: + if ip_obj in ipaddress.ip_network(cidr): + return True + + except (ValueError, ipaddress.AddressValueError): + return True # Invalid IP format + return False + class Scraper: + """Base scraper class for proxy sources.""" - def __init__(self, method, _url): + def __init__(self, method: str, _url: str, timeout: int = 10): self.method = method self._url = _url + self.timeout = timeout + self.source_name = self.__class__.__name__ - def get_url(self, **kwargs): + def get_url(self, **kwargs) -> str: + """Get the formatted URL for the scraper.""" return self._url.format(**kwargs, method=self.method) - async def get_response(self, client): - return await client.get(self.get_url()) + async def get_response(self, client: httpx.AsyncClient) -> httpx.Response: + """Get HTTP response from the proxy source.""" + return await client.get(self.get_url(), timeout=self.timeout) - async def handle(self, response): + async def handle(self, response: httpx.Response) -> str: + """Handle the response and extract proxy data.""" return response.text - async def scrape(self, client): - response = await self.get_response(client) - proxies = await self.handle(response) - pattern = re.compile(r"\d{1,3}(?:\.\d{1,3}){3}(?::\d{1,5})?") - return re.findall(pattern, proxies) + def filter_proxies(self, proxy_text: str) -> Tuple[Set[str], Dict[str, int]]: + """Filter proxies and return valid ones with statistics.""" + proxies = set() + stats = {"total": 0, "filtered_bad": 0, "filtered_invalid": 0, "valid": 0} + + for line in proxy_text.split('\n'): + line = line.strip() + if not line: + continue + + stats["total"] += 1 + + # Basic format validation + if ':' not in line: + stats["filtered_invalid"] += 1 + continue + + try: + ip, port = line.split(':', 1) + ip = ip.strip() + port = port.strip() + + # Validate IP format + ipaddress.ip_address(ip) + + # Validate port + port_num = int(port) + if not (1 <= port_num <= 65535): + stats["filtered_invalid"] += 1 + continue + + # Check if it's a bad IP (CDN, etc.) 
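+                # (is_bad_ip() rejects private/loopback/reserved/multicast addresses
+                # as well as the hard-coded Cloudflare and CloudFront ranges in
+                # BAD_IP_RANGES, since CDN edge IPs are not usable as proxies.)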
+ if is_bad_ip(ip): + stats["filtered_bad"] += 1 + logger.debug(f"Filtered bad IP from {self.source_name}: {ip}:{port}") + continue + + proxies.add(f"{ip}:{port}") + stats["valid"] += 1 + + except (ValueError, ipaddress.AddressValueError): + stats["filtered_invalid"] += 1 + continue + + return proxies, stats + + async def scrape(self, client: httpx.AsyncClient) -> Tuple[List[str], Dict[str, int]]: + """Scrape proxies from the source.""" + try: + response = await self.get_response(client) + response.raise_for_status() # Raise an exception for bad status codes + proxy_text = await self.handle(response) + + # Use regex to find all potential proxies + pattern = re.compile(r"\d{1,3}(?:\.\d{1,3}){3}(?::\d{1,5})?") + raw_proxies = re.findall(pattern, proxy_text) + + # Filter and validate proxies + valid_proxies, stats = self.filter_proxies('\n'.join(raw_proxies)) + + return list(valid_proxies), stats + except Exception as e: + logger.debug(f"Failed to scrape from {self.source_name} ({self.get_url()}): {e}") + return [], {"total": 0, "filtered_bad": 0, "filtered_invalid": 0, "valid": 0} # From spys.me class SpysMeScraper(Scraper): + """Scraper for spys.me proxy source.""" - def __init__(self, method): - super().__init__(method, "https://spys.me/{mode}.txt") + def __init__(self, method: str): + super().__init__(method, "https://spys.me/{mode}.txt", timeout=15) - def get_url(self, **kwargs): + def get_url(self, **kwargs) -> str: + """Get URL with appropriate mode for the proxy method.""" mode = "proxy" if self.method == "http" else "socks" if self.method == "socks" else "unknown" if mode == "unknown": - raise NotImplementedError + raise NotImplementedError(f"Method {self.method} not supported by SpysMeScraper") return super().get_url(mode=mode, **kwargs) + async def handle(self, response: httpx.Response) -> str: + """Parse spys.me format to extract only IP:port.""" + try: + lines = response.text.strip().split('\n') + proxies: Set[str] = set() + + for line in lines: + line = line.strip() + if not line: + continue + + # Skip header lines and comments + if (line.startswith('Proxy list') or + line.startswith('Socks proxy=') or + line.startswith('Support by') or + line.startswith('BTC ') or + line.startswith('IP address:Port') or + line.startswith('#')): + continue + + # Extract IP:port from lines like "89.58.55.193:80 DE-A + " + # The format is: IP:PORT COUNTRY-ANONYMITY-SSL GOOGLE_PASSED + parts = line.split() + if parts and ':' in parts[0]: + proxy = parts[0].strip() + # Validate IP:port format + if re.match(r"\d{1,3}(?:\.\d{1,3}){3}:\d{1,5}", proxy): + proxies.add(proxy) + + return "\n".join(proxies) + except Exception as e: + logger.debug(f"Error parsing spys.me format: {e}") + return "" + # From proxyscrape.com class ProxyScrapeScraper(Scraper): + """Scraper for proxyscrape.com v4 API.""" - def __init__(self, method, timeout=1000, country="All"): - self.timout = timeout + def __init__(self, method: str, country: str = "all"): self.country = country super().__init__(method, - "https://api.proxyscrape.com/?request=getproxies" - "&proxytype={method}" - "&timeout={timout}" - "&country={country}") - - def get_url(self, **kwargs): - return super().get_url(timout=self.timout, country=self.country, **kwargs) - -# From geonode.com - A little dirty, grab http(s) and socks but use just for socks -class GeoNodeScraper(Scraper): - - def __init__(self, method, limit="500", page="1", sort_by="lastChecked", sort_type="desc"): - self.limit = limit - self.page = page - self.sort_by = sort_by - self.sort_type = 
sort_type - super().__init__(method, - "https://proxylist.geonode.com/api/proxy-list?" - "&limit={limit}" - "&page={page}" - "&sort_by={sort_by}" - "&sort_type={sort_type}") + "https://api.proxyscrape.com/v4/free-proxy-list/get?" + "request=display_proxies&proxy_format=ipport&format=text" + "&protocol={method}&country={country}", + timeout=20) - def get_url(self, **kwargs): - return super().get_url(limit=self.limit, page=self.page, sort_by=self.sort_by, sort_type=self.sort_type, **kwargs) + def get_url(self, **kwargs) -> str: + """Get URL with API parameters.""" + return super().get_url(country=self.country, **kwargs) # From proxy-list.download class ProxyListDownloadScraper(Scraper): + """Scraper for proxy-list.download API.""" - def __init__(self, method, anon): + def __init__(self, method: str, anon: str): self.anon = anon - super().__init__(method, "https://www.proxy-list.download/api/v1/get?type={method}&anon={anon}") + super().__init__(method, "https://www.proxy-list.download/api/v1/get?type={method}&anon={anon}", timeout=15) - def get_url(self, **kwargs): + def get_url(self, **kwargs) -> str: + """Get URL with anonymity level parameter.""" return super().get_url(anon=self.anon, **kwargs) # For websites using table in html class GeneralTableScraper(Scraper): + """Scraper for websites that use HTML tables to display proxies.""" - async def handle(self, response): - soup = BeautifulSoup(response.text, "html.parser") - proxies = set() - table = soup.find("table", attrs={"class": "table table-striped table-bordered"}) - for row in table.findAll("tr"): - count = 0 - proxy = "" - for cell in row.findAll("td"): - if count == 1: - proxy += ":" + cell.text.replace(" ", "") - proxies.add(proxy) - break - proxy += cell.text.replace(" ", "") - count += 1 - return "\n".join(proxies) + async def handle(self, response: httpx.Response) -> str: + """Parse HTML table to extract proxies.""" + try: + soup = BeautifulSoup(response.text, "html.parser") + proxies: Set[str] = set() + table = soup.find("table", attrs={"class": "table table-striped table-bordered"}) + + if table is None: + logger.debug("No table found with expected class") + return "" + + for row in table.find_all("tr"): + cells = row.find_all("td") + if len(cells) >= 2: + ip = cells[0].get_text(strip=True).replace(" ", "") + port = cells[1].get_text(strip=True).replace(" ", "") + if ip and port: + proxies.add(f"{ip}:{port}") + + return "\n".join(proxies) + except Exception as e: + logger.debug(f"Error parsing HTML table: {e}") + return "" # For websites using div in html class GeneralDivScraper(Scraper): + """Scraper for websites that use HTML divs to display proxies.""" - async def handle(self, response): - soup = BeautifulSoup(response.text, "html.parser") - proxies = set() - table = soup.find("div", attrs={"class": "list"}) - for row in table.findAll("div"): - count = 0 - proxy = "" - for cell in row.findAll("div", attrs={"class": "td"}): - if count == 2: - break - proxy += cell.text+":" - count += 1 - proxy = proxy.rstrip(":") - proxies.add(proxy) - return "\n".join(proxies) + async def handle(self, response: httpx.Response) -> str: + """Parse HTML divs to extract proxies.""" + try: + soup = BeautifulSoup(response.text, "html.parser") + proxies: Set[str] = set() + container = soup.find("div", attrs={"class": "list"}) + + if container is None: + logger.debug("No div found with class 'list'") + return "" + + for row in container.find_all("div"): + cells = row.find_all("div", attrs={"class": "td"}) + if len(cells) >= 2: + ip = 
cells[0].get_text(strip=True) + port = cells[1].get_text(strip=True) + if ip and port: + proxies.add(f"{ip}:{port}") + + return "\n".join(proxies) + except Exception as e: + logger.debug(f"Error parsing HTML divs: {e}") + return "" # For scraping live proxylist from github class GitHubScraper(Scraper): + """Scraper for GitHub raw proxy lists.""" - async def handle(self, response): - tempproxies = response.text.split("\n") + async def handle(self, response: httpx.Response) -> str: + """Parse GitHub raw proxy list format.""" + try: + temp_proxies = response.text.strip().split("\n") + proxies: Set[str] = set() + + for proxy_line in temp_proxies: + proxy_line = proxy_line.strip() + if not proxy_line: + continue + + # Handle different formats: "type://ip:port" or just "ip:port" + if self.method in proxy_line: + # Extract IP:port from lines like "http://1.2.3.4:8080" + if "//" in proxy_line: + proxy = proxy_line.split("//")[-1] + else: + proxy = proxy_line + + # Validate IP:port format + if re.match(r"\d{1,3}(?:\.\d{1,3}){3}:\d{1,5}", proxy): + proxies.add(proxy) + + return "\n".join(proxies) + except Exception as e: + logger.debug(f"Error parsing GitHub proxy list: {e}") + return "" + +# For scraping from proxy list APIs with JSON response +class ProxyListApiScraper(Scraper): + """Scraper for APIs that return JSON proxy lists.""" + + def _extract_proxy_from_item(self, item: dict) -> Optional[str]: + """Extract proxy string from a single item for new www.proxy-list.download format.""" + if not isinstance(item, dict): + return None + # Support both old and new keys + ip = item.get('ip') or item.get('IP') + port = item.get('port') or item.get('PORT') + if ip and port: + return f"{ip}:{port}" + return None + + def _process_dict_data(self, data: dict) -> Set[str]: + """Process dict-type JSON data for new www.proxy-list.download format.""" proxies = set() - for prxy in tempproxies: - if self.method in prxy: - proxies.add(prxy.split("//")[-1]) - - return "\n".join(proxies) - + # New format: proxies are in 'LISTA' key + if 'LISTA' in data and isinstance(data['LISTA'], list): + for item in data['LISTA']: + proxy = self._extract_proxy_from_item(item) + if proxy: + proxies.add(proxy) + # Fallback for old format + elif 'data' in data and isinstance(data['data'], list): + for item in data['data']: + proxy = self._extract_proxy_from_item(item) + if proxy: + proxies.add(proxy) + return proxies + async def handle(self, response: httpx.Response) -> str: + """Parse JSON API response for proxies (new and old format).""" + try: + data = response.json() + proxies: Set[str] = set() + if isinstance(data, dict): + proxies = self._process_dict_data(data) + return "\n".join(proxies) + except Exception as e: + logger.debug(f"Error parsing JSON API response: {e}") + return "" + +# Helper functions for PlainTextScraper +def _is_protocol_match(protocol: str, method: str) -> bool: + """Check if protocol matches the scraper method.""" + return (protocol.lower() == method.lower() or + (method == "socks" and protocol.lower() in ["socks4", "socks5"])) + +def _is_valid_proxy_format(address: str) -> bool: + """Validate IP:port format.""" + return bool(re.match(r"\d{1,3}(?:\.\d{1,3}){3}:\d{1,5}", address)) + +def _process_protocol_line(line: str, method: str) -> Optional[str]: + """Process a line with protocol://ip:port format.""" + protocol, address = line.split("://", 1) + if _is_protocol_match(protocol, method): + if _is_valid_proxy_format(address): + return address + return None + +def _process_plain_line(line: str) -> 
Optional[str]: + """Process a plain IP:port line.""" + if _is_valid_proxy_format(line): + return line + return None + +# For scraping from plain text sources +class PlainTextScraper(Scraper): + """Scraper for plain text proxy lists.""" + + async def handle(self, response: httpx.Response) -> str: + """Parse plain text proxy list.""" + try: + proxies: Set[str] = set() + lines = response.text.strip().split('\n') + + for line in lines: + line = line.strip() + if not line or line.startswith('#'): + continue + + # Handle protocol://ip:port format (ProxyScrape v4 API) + if "://" in line: + proxy = _process_protocol_line(line, self.method) + if proxy: + proxies.add(proxy) + else: + # Look for plain IP:port pattern (legacy format) + proxy = _process_plain_line(line) + if proxy: + proxies.add(proxy) + + return "\n".join(proxies) + except Exception as e: + logger.debug(f"Error parsing plain text proxy list: {e}") + return "" + + +# Latest and most frequently updated proxy sources (2025) scrapers = [ + # Primary API scrapers (most reliable) SpysMeScraper("http"), SpysMeScraper("socks"), ProxyScrapeScraper("http"), ProxyScrapeScraper("socks4"), ProxyScrapeScraper("socks5"), - GeoNodeScraper("socks"), - ProxyListDownloadScraper("https", "elite"), - ProxyListDownloadScraper("http", "elite"), - ProxyListDownloadScraper("http", "transparent"), - ProxyListDownloadScraper("http", "anonymous"), + + # TheSpeedX/PROXY-List (updated daily) + GitHubScraper("http", "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt"), + GitHubScraper("socks4", "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt"), + GitHubScraper("socks5", "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt"), + + # jetkai/proxy-list (hourly updates, geolocation) + GitHubScraper("http", "https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-http.txt"), + GitHubScraper("https", "https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-https.txt"), + GitHubScraper("socks4", "https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-socks4.txt"), + GitHubScraper("socks5", "https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-socks5.txt"), + + # prxchk/proxy-list (10 min updates, deduplicated) + GitHubScraper("http", "https://raw.githubusercontent.com/prxchk/proxy-list/main/http.txt"), + GitHubScraper("socks4", "https://raw.githubusercontent.com/prxchk/proxy-list/main/socks4.txt"), + GitHubScraper("socks5", "https://raw.githubusercontent.com/prxchk/proxy-list/main/socks5.txt"), + + # roosterkid/openproxylist (hourly updates) + GitHubScraper("http", "https://raw.githubusercontent.com/roosterkid/openproxylist/main/HTTPS_RAW.txt"), + GitHubScraper("socks4", "https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS4_RAW.txt"), + GitHubScraper("socks5", "https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS5_RAW.txt"), + + # mmpx12/proxy-list (hourly updates) + GitHubScraper("http", "https://raw.githubusercontent.com/mmpx12/proxy-list/master/http.txt"), + GitHubScraper("https", "https://raw.githubusercontent.com/mmpx12/proxy-list/master/https.txt"), + GitHubScraper("socks4", "https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks4.txt"), + GitHubScraper("socks5", "https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks5.txt"), + + + + # ProxyScrape API v4 (live, no key needed) + PlainTextScraper("http", 
"https://api.proxyscrape.com/v4/free-proxy-list/get?request=display_proxies&protocol=http&proxy_format=protocolipport&format=text&timeout=20000"), + PlainTextScraper("socks4", "https://api.proxyscrape.com/v4/free-proxy-list/get?request=display_proxies&protocol=socks4&proxy_format=protocolipport&format=text&timeout=20000"), + PlainTextScraper("socks5", "https://api.proxyscrape.com/v4/free-proxy-list/get?request=display_proxies&protocol=socks5&proxy_format=protocolipport&format=text&timeout=20000"), + + # OpenProxyList API (10 min updates) + PlainTextScraper("http", "https://api.openproxylist.xyz/http.txt"), + PlainTextScraper("https", "https://api.openproxylist.xyz/https.txt"), + PlainTextScraper("socks4", "https://api.openproxylist.xyz/socks4.txt"), + PlainTextScraper("socks5", "https://api.openproxylist.xyz/socks5.txt"), + PlainTextScraper("http", "https://www.proxyscan.io/download?type=http"), + PlainTextScraper("socks4", "https://www.proxyscan.io/download?type=socks4"), + PlainTextScraper("socks5", "https://raw.githubusercontent.com/Surfboardv2ray/Proxy-sorter/main/socks5.txt"), + + # JSON APIs + ProxyListApiScraper("http", "https://www.proxy-list.download/api/v2/get?l=en&t=http"), + ProxyListApiScraper("https", "https://www.proxy-list.download/api/v2/get?l=en&t=https"), + ProxyListApiScraper("socks4", "https://www.proxy-list.download/api/v2/get?l=en&t=socks4"), + ProxyListApiScraper("socks5", "https://www.proxy-list.download/api/v2/get?l=en&t=socks5"), + + # Fresh community sources (updated daily) + GitHubScraper("http", "https://raw.githubusercontent.com/prxchk/proxy-list/main/http.txt"), + GitHubScraper("socks4", "https://raw.githubusercontent.com/prxchk/proxy-list/main/socks4.txt"), + GitHubScraper("socks5", "https://raw.githubusercontent.com/prxchk/proxy-list/main/socks5.txt"), + + # Ultra-fresh sources (updated every few hours) + PlainTextScraper("http", "https://api.openproxylist.xyz/http.txt"), + PlainTextScraper("socks4", "https://api.openproxylist.xyz/socks4.txt"), + PlainTextScraper("socks5", "https://api.openproxylist.xyz/socks5.txt"), + + # Elite proxy APIs + + + # New 2025 sources + GitHubScraper("http", "https://raw.githubusercontent.com/rdavydov/proxy-list/main/proxies/http.txt"), + GitHubScraper("https", "https://raw.githubusercontent.com/rdavydov/proxy-list/main/proxies/https.txt"), + GitHubScraper("socks4", "https://raw.githubusercontent.com/rdavydov/proxy-list/main/proxies/socks4.txt"), + GitHubScraper("socks5", "https://raw.githubusercontent.com/rdavydov/proxy-list/main/proxies/socks5.txt"), + + # Quality HTML scrapers (still active) GeneralTableScraper("https", "http://sslproxies.org"), GeneralTableScraper("http", "http://free-proxy-list.net"), GeneralTableScraper("http", "http://us-proxy.org"), GeneralTableScraper("socks", "http://socks-proxy.net"), + + + GeneralTableScraper("http", "https://premproxy.com/proxy-by-country/"), + GeneralTableScraper("https", "https://premproxy.com/socks-list/"), + GeneralTableScraper("http", "https://proxyservers.pro/proxy/list/protocol/http"), + GeneralTableScraper("https", "https://proxyservers.pro/proxy/list/protocol/https"), + + # Updated HTML div scrapers GeneralDivScraper("http", "https://freeproxy.lunaproxy.com/"), - GitHubScraper("http", "https://raw.githubusercontent.com/proxifly/free-proxy-list/main/proxies/all/data.txt"), - GitHubScraper("socks4", "https://raw.githubusercontent.com/proxifly/free-proxy-list/main/proxies/all/data.txt"), - GitHubScraper("socks5", 
"https://raw.githubusercontent.com/proxifly/free-proxy-list/main/proxies/all/data.txt"), - GitHubScraper("http", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/all.txt"), - GitHubScraper("socks", "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/all.txt"), - GitHubScraper("https", "https://raw.githubusercontent.com/zloi-user/hideip.me/main/https.txt"), - GitHubScraper("http", "https://raw.githubusercontent.com/zloi-user/hideip.me/main/http.txt"), - GitHubScraper("socks4", "https://raw.githubusercontent.com/zloi-user/hideip.me/main/socks4.txt"), - GitHubScraper("socks5", "https://raw.githubusercontent.com/zloi-user/hideip.me/main/socks5.txt"), + GeneralDivScraper("http", "https://www.freeproxylists.net/"), + GeneralDivScraper("socks4", "https://www.freeproxylists.net/socks4.html"), + GeneralDivScraper("socks5", "https://www.freeproxylists.net/socks5.html"), + + # Modern proxy sites with table format + GeneralTableScraper("http", "https://hidemy.name/en/proxy-list/?type=h"), + GeneralTableScraper("https", "https://hidemy.name/en/proxy-list/?type=s"), + GeneralTableScraper("socks4", "https://hidemy.name/en/proxy-list/?type=4"), + GeneralTableScraper("socks5", "https://hidemy.name/en/proxy-list/?type=5"), + + # Additional HTML sources + GeneralTableScraper("http", "https://www.proxynova.com/proxy-server-list/"), + GeneralTableScraper("http", "https://www.proxydocker.com/en/proxylist/"), + GeneralTableScraper("https", "https://www.proxydocker.com/en/proxylist/type/https"), ] -def verbose_print(verbose, message): + +def verbose_print(verbose: bool, message: str) -> None: + """Print message if verbose mode is enabled.""" if verbose: print(message) -async def scrape(method, output, verbose): - now = time.time() + +def _determine_scraping_methods(method: str) -> List[str]: + """Determine which methods to scrape based on input.""" methods = [method] if method == "socks": - methods += ["socks4", "socks5"] + methods.extend(["socks4", "socks5"]) + return methods + +def _get_scrapers_for_methods(methods: List[str]) -> List: + """Get scrapers that match the specified methods.""" proxy_scrapers = [s for s in scrapers if s.method in methods] if not proxy_scrapers: - raise ValueError("Method not supported") - verbose_print(verbose, "Scraping proxies...") - proxies = [] - - tasks = [] - client = httpx.AsyncClient(follow_redirects=True) + raise ValueError(f"Methods '{methods}' not supported") + return proxy_scrapers + +def _create_http_client_config() -> Dict: + """Create HTTP client configuration.""" + return { + "follow_redirects": True, + "timeout": 30.0, + "limits": httpx.Limits(max_keepalive_connections=20, max_connections=100), + "headers": { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + }, + } + +def _print_source_statistics(verbose: bool, source_stats: Dict) -> None: + """Print source statistics if verbose mode is enabled.""" + if not verbose: + return + domain_valid, skipped, total_bad_filtered, total_invalid_filtered = _aggregate_domain_stats(source_stats) + _print_summary(domain_valid, skipped, total_bad_filtered, total_invalid_filtered) + +async def scrape(method: str, output: str, verbose: bool) -> None: + """ + Main scraping function that coordinates all scrapers. 
+ + Args: + method: Proxy type to scrape (http, https, socks, socks4, socks5) + output: Output file path + verbose: Enable verbose logging + """ + start_time = time.time() + + # Setup scraping parameters + methods = _determine_scraping_methods(method) + proxy_scrapers = _get_scrapers_for_methods(methods) + client_config = _create_http_client_config() + + verbose_print(verbose, f"Scraping proxies using {len(proxy_scrapers)} sources...") + all_proxies: List[str] = [] + source_stats: Dict[str, Dict[str, int]] = {} - async def scrape_scraper(scraper): + async def scrape_source(scraper, client) -> None: + """Scrape from a single source.""" try: - verbose_print(verbose, f"Looking {scraper.get_url()}...") - proxies.extend(await scraper.scrape(client)) - except Exception: - pass - - for scraper in proxy_scrapers: - tasks.append(asyncio.ensure_future(scrape_scraper(scraper))) - - await asyncio.gather(*tasks) - await client.aclose() - - proxies = set(proxies) - verbose_print(verbose, f"Writing {len(proxies)} proxies to file...") - with open(output, "w") as f: - f.write("\n".join(proxies)) - verbose_print(verbose, "Done!") - verbose_print(verbose, f"Took {time.time() - now} seconds") - -def main(): - parser = argparse.ArgumentParser() + source_id = f"{scraper.source_name}: {scraper.get_url()}" + verbose_print(verbose, f"Scraping from {scraper.get_url()}...") + proxies, stats = await scraper.scrape(client) + all_proxies.extend(proxies) + source_stats[source_id] = stats + verbose_print(verbose, f"Found {len(proxies)} valid proxies from {source_id} ({stats['filtered_bad']} bad IPs filtered, {stats['filtered_invalid']} invalid filtered)") + except Exception as e: + source_id = f"{scraper.source_name}: {scraper.get_url()}" + logger.debug(f"Failed to scrape from {source_id}: {e}") + source_stats[source_id] = {"total": 0, "filtered_bad": 0, "filtered_invalid": 0, "valid": 0} + + # Execute all scrapers concurrently + async with httpx.AsyncClient(**client_config) as client: + tasks = [scrape_source(scraper, client) for scraper in proxy_scrapers] + await asyncio.gather(*tasks, return_exceptions=True) + + # Process results + unique_proxies: Set[str] = set(all_proxies) + _print_source_statistics(verbose, source_stats) + + # Write results to file + verbose_print(verbose, f"Writing {len(unique_proxies)} unique proxies to {output}...") + try: + with open(output, "w", encoding="utf-8") as f: + f.write("\n".join(sorted(unique_proxies)) + "\n") + except IOError as e: + logger.error(f"Failed to write to output file {output}: {e}") + raise + + elapsed_time = time.time() - start_time + verbose_print(verbose, f"Scraping completed in {elapsed_time:.2f} seconds") + verbose_print(verbose, f"Found {len(unique_proxies)} unique valid proxies") + +def _setup_argument_parser(): + """Set up and return the argument parser.""" + parser = argparse.ArgumentParser( + description="Scrape proxies from multiple sources", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s -p http -v # Scrape HTTP proxies with verbose output + %(prog)s -p socks -o socks.txt # Scrape SOCKS proxies to custom file + %(prog)s -p https --verbose # Scrape HTTPS proxies with verbose output + %(prog)s -p socks4 --debug # Scrape SOCKS4 proxies with debug logging + %(prog)s -p socks5 -o output.txt -v # Scrape SOCKS5 proxies to output.txt with verbose logging + %(prog)s -p http -o proxies.txt --debug # Scrape HTTP proxies to proxies.txt with debug logging + """, + ) + + supported_methods = sorted(set(s.method for s in scrapers)) 
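+    # supported_methods is derived from the scraper registry above; with the
+    # current list it resolves to: http, https, socks, socks4, socks5.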
+ parser.add_argument( - "-p", - "--proxy", - help="Supported proxy type: " + ", ".join(sorted(set([s.method for s in scrapers]))), + "-p", "--proxy", required=True, + choices=supported_methods, + help=f"Proxy type to scrape. Supported types: {', '.join(supported_methods)}", ) parser.add_argument( - "-o", - "--output", - help="Output file name to save .txt file", + "-o", "--output", default="output.txt", + help="Output file name to save proxies (default: %(default)s)", ) parser.add_argument( - "-v", - "--verbose", - help="Increase output verbosity", + "-v", "--verbose", action="store_true", + help="Enable verbose output", ) - args = parser.parse_args() - - if sys.version_info >= (3, 7) and platform.system() == 'Windows': - loop = asyncio.get_event_loop() - loop.run_until_complete(scrape(args.proxy, args.output, args.verbose)) - loop.close() - elif sys.version_info >= (3, 7): + parser.add_argument( + "--debug", + action="store_true", + help="Enable debug logging", + ) + + return parser + +def _configure_logging(args): + """Configure logging based on command line arguments.""" + if args.debug: + logging.getLogger().setLevel(logging.DEBUG) + elif args.verbose: + logging.getLogger().setLevel(logging.INFO) + else: + logging.getLogger().setLevel(logging.WARNING) + +def _run_scraping(args): + """Run the scraping process with appropriate event loop handling.""" + if sys.version_info >= (3, 7): + if platform.system() == 'Windows': + # Windows-specific asyncio policy for better compatibility + asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy()) asyncio.run(scrape(args.proxy, args.output, args.verbose)) else: + # Fallback for Python < 3.7 loop = asyncio.get_event_loop() - loop.run_until_complete(scrape(args.proxy, args.output, args.verbose)) - loop.close() + try: + loop.run_until_complete(scrape(args.proxy, args.output, args.verbose)) + finally: + loop.close() + +def main() -> None: + """Main entry point for the proxy scraper.""" + parser = _setup_argument_parser() + args = parser.parse_args() + + _configure_logging(args) + + try: + _run_scraping(args) + except KeyboardInterrupt: + print("\nScraping interrupted by user") + sys.exit(1) + except Exception as e: + logger.error(f"Scraping failed: {e}") + if args.debug: + import traceback + traceback.print_exc() + sys.exit(1) + if __name__ == "__main__": main() diff --git a/requirements.txt b/requirements.txt index 97b770a..c6a4ddd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ -beautifulsoup4==4.11.1 -requests==2.27.1 -colorama==0.4.4 -urllib3==1.26.9 -httpx -socks -PySocks \ No newline at end of file +beautifulsoup4>=4.11.1,<5.0.0 +requests>=2.27.1,<3.0.0 +colorama>=0.4.4,<1.0.0 +urllib3>=1.26.9,<3.0.0 +httpx>=0.23.0,<1.0.0 +PySocks>=1.7.1,<2.0.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 575d218..4f12769 100644 --- a/setup.py +++ b/setup.py @@ -2,12 +2,12 @@ setup( name='proxyz', - version='0.2.0', + version='0.3.0', py_modules=['proxyScraper', 'proxyChecker'], install_requires=[ - 'httpx', - 'beautifulsoup4', - 'pysocks', + 'httpx>=0.23.0,<1.0.0', + 'beautifulsoup4>=4.11.1,<5.0.0', + 'pysocks>=1.7.1,<2.0.0', ], entry_points={ 'console_scripts': [ @@ -21,14 +21,18 @@ }, author='Nima Akbarzadeh', author_email='iw4p@protonmail.com', - description='scrape proxies from more than 5 different sources and check which ones are still alive', + description='scrape proxies from more than 12 different sources and check which ones are still alive', long_description=open('README.md').read(), 
long_description_content_type='text/markdown', url='https://github.com/iw4p/proxy-scraper', classifiers=[ 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', 'License :: OSI Approved :: MIT License', 'Operating System :: OS Independent', ], - python_requires='>=3.7', + python_requires='>=3.9', ) diff --git a/user_agents.txt b/user_agents.txt index ae82bd5..b9e20a0 100644 --- a/user_agents.txt +++ b/user_agents.txt @@ -1,3 +1,40 @@ +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 +Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:133.0) Gecko/20100101 Firefox/133.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:132.0) Gecko/20100101 Firefox/132.0 +Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0 +Mozilla/5.0 (X11; Linux x86_64; rv:132.0) Gecko/20100101 Firefox/132.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15 +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 OPR/117.0.0.0 +Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 OPR/116.0.0.0 +Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 OPR/117.0.0.0 +Mozilla/5.0 (iPhone; CPU iPhone OS 18_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Mobile/15E148 Safari/604.1 +Mozilla/5.0 (iPhone; CPU iPhone OS 18_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Mobile/15E148 Safari/604.1 +Mozilla/5.0 (iPad; CPU OS 18_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Mobile/15E148 Safari/604.1 +Mozilla/5.0 (iPad; CPU OS 18_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Mobile/15E148 Safari/604.1 +Mozilla/5.0 (iPhone; CPU iPhone OS 18_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/131.0.6778.73 Mobile/15E148 Safari/604.1 +Mozilla/5.0 (iPhone; CPU iPhone OS 18_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/130.0.6723.90 
Mobile/15E148 Safari/604.1 +Mozilla/5.0 (Android 14; Mobile; rv:133.0) Gecko/133.0 Firefox/133.0 +Mozilla/5.0 (Android 13; Mobile; rv:132.0) Gecko/132.0 Firefox/132.0 +Mozilla/5.0 (Linux; Android 14; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36 +Mozilla/5.0 (Linux; Android 13; SM-G991B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Mobile Safari/537.36 +Mozilla/5.0 (Linux; Android 14; Pixel 8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36 +Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Mobile Safari/537.36 +Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 +Mozilla/5.0 (Windows NT 11.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0 +Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0 +Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0 +Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:132.0) Gecko/20100101 Firefox/132.0 Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/37.0.2062.94 Chrome/37.0.2062.94 Safari/537.36 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36 Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko