Hunt down social media accounts by username across social networks
—
Site configuration and data management for loading, filtering, and organizing information about supported social networks and platforms. The site management system handles the database of over 400 supported sites with their detection methods and metadata.
Container class that holds comprehensive information about a single social media platform or website, including detection methods and testing data.
class SiteInformation:
"""
Information about a specific website/platform.
Contains all data needed to check for username existence on a particular site.
"""
def __init__(
self,
name: str,
url_home: str,
url_username_format: str,
username_claimed: str,
information: dict,
is_nsfw: bool,
username_unclaimed: str = secrets.token_urlsafe(10)
):
"""
Create Site Information Object.
Args:
name: String which identifies the site
url_home: String containing URL for home page of site
url_username_format: String containing URL format for usernames on site
(should contain "{}" placeholder for username substitution)
Example: "https://somesite.com/users/{}"
username_claimed: String containing username known to be claimed on website
information: Dictionary containing site-specific detection information
(includes custom detection methods and parameters)
is_nsfw: Boolean indicating if site is Not Safe For Work
username_unclaimed: String containing username known to be unclaimed
(defaults to secrets.token_urlsafe(10) if not provided)
"""
name: str # Site identifier name
url_home: str # Homepage URL
url_username_format: str # URL template with {} placeholder
username_claimed: str # Known claimed username for testing
username_unclaimed: str # Known unclaimed username for testing
information: dict # Site-specific detection configuration
is_nsfw: bool # Not Safe For Work flag
def __str__(self) -> str:
"""
String representation showing site name and homepage.
Returns:
Formatted string with site name and homepage URL
"""Manager class that loads and organizes information about all supported sites, with filtering and querying capabilities.
class SitesInformation:
"""
Container for information about all supported sites.
Manages the collection of site data and provides filtering and access methods.
"""
def __init__(self, data_file_path: str = None):
"""
Create Sites Information Object.
Loads site data from JSON file or URL. If no path specified, uses the
default live data from GitHub repository for most up-to-date information.
Args:
data_file_path: Path to JSON data file. Supports:
- Absolute file path: "/path/to/data.json"
- Relative file path: "data.json"
- URL: "https://example.com/data.json"
- None (default): Uses live GitHub data
Raises:
FileNotFoundError: If data file cannot be accessed
ValueError: If JSON data cannot be parsed
"""
sites: dict # Dictionary mapping site names to SiteInformation objects
def remove_nsfw_sites(self, do_not_remove: list = []):
"""
Remove NSFW (Not Safe For Work) sites from the collection.
Filters out sites marked with isNSFW flag, with optional exceptions.
Args:
do_not_remove: List of site names to keep even if marked NSFW
(case-insensitive matching)
"""
def site_name_list(self) -> list:
"""
Get sorted list of all site names.
Returns:
List of strings containing site names, sorted alphabetically (case-insensitive)
"""
def __iter__(self):
"""
Iterator over SiteInformation objects.
Yields:
SiteInformation objects for each site in the collection
"""
def __len__(self) -> int:
"""
Get number of sites in collection.
Returns:
Integer count of sites
"""Site data is stored in JSON format with the following structure for each site:
{
"SiteName": {
"urlMain": "https://example.com/",
"url": "https://example.com/user/{}",
"username_claimed": "known_user",
"errorType": "status_code",
"isNSFW": false,
"headers": {
"User-Agent": "custom-agent"
}
}
}

Sites use various methods to detect username existence:
from sherlock_project.sites import SitesInformation
# Load default site data from GitHub (most up-to-date)
sites = SitesInformation()
print(f"Loaded {len(sites)} sites")
print(f"Available sites: {sites.site_name_list()[:10]}") # First 10 sites# Load from local file
sites = SitesInformation("custom_sites.json")
# Load from URL
sites = SitesInformation("https://example.com/my_sites.json")
# Load from absolute path
sites = SitesInformation("/path/to/sites.json")

# Remove all NSFW sites
sites = SitesInformation()
print(f"Before filtering: {len(sites)} sites")
sites.remove_nsfw_sites()
print(f"After filtering: {len(sites)} sites")
# Keep specific NSFW sites while removing others
sites = SitesInformation()
sites.remove_nsfw_sites(do_not_remove=["Reddit", "Tumblr"])

# Iterate through all sites
for site in sites:
print(f"Site: {site.name}")
print(f" Homepage: {site.url_home}")
print(f" URL Format: {site.url_username_format}")
print(f" NSFW: {site.is_nsfw}")
print(f" Test User: {site.username_claimed}")
print()
# Access specific site
github_site = sites.sites["GitHub"]
print(f"GitHub URL format: {github_site.url_username_format}")
print(f"GitHub test user: {github_site.username_claimed}")
# Check if site exists
if "Twitter" in sites.sites:
twitter_site = sites.sites["Twitter"]
print(f"Twitter detection method: {twitter_site.information.get('errorType')}")# Create subset of specific sites
social_media_sites = {
name: sites.sites[name] for name in sites.sites
if name in ["GitHub", "Twitter", "Instagram", "Facebook", "LinkedIn"]
}
print(f"Social media subset: {len(social_media_sites)} sites")
# Create subset by category
tech_sites = {}
for name, site in sites.sites.items():
if any(keyword in site.url_home.lower() for keyword in ["github", "gitlab", "stackoverflow", "dev"]):
tech_sites[name] = site
print(f"Tech-related sites: {len(tech_sites)} sites")# Analyze detection methods
detection_methods = {}
nsfw_count = 0
for site in sites:
method = site.information.get('errorType', 'unknown')
detection_methods[method] = detection_methods.get(method, 0) + 1
if site.is_nsfw:
nsfw_count += 1
print("Detection method distribution:")
for method, count in detection_methods.items():
print(f" {method}: {count} sites")
print(f"\nNSFW sites: {nsfw_count}/{len(sites)}")# Create custom site information
custom_site = SiteInformation(
name="CustomSite",
url_home="https://customsite.com/",
url_username_format="https://customsite.com/profile/{}",
username_claimed="testuser",
information={
"errorType": "status_code",
"headers": {
"User-Agent": "Custom-Bot/1.0"
}
},
is_nsfw=False
)
# Add to existing site collection
sites.sites["CustomSite"] = custom_site
print(f"Added custom site. Total sites: {len(sites)}")import json
# Export current site configuration
site_data = {}
for name, site in sites.sites.items():
site_data[name] = {
"urlMain": site.url_home,
"url": site.url_username_format,
"username_claimed": site.username_claimed,
"isNSFW": site.is_nsfw,
**site.information # Include all detection-specific data
}
# Save to file
with open("exported_sites.json", "w") as f:
json.dump(site_data, f, indent=2)
# Load and verify
test_sites = SitesInformation("exported_sites.json")
print(f"Exported and reloaded {len(test_sites)} sites")from sherlock_project.sherlock import sherlock
from sherlock_project.notify import QueryNotifyPrint
import statistics
# Test a small subset for performance analysis
test_sites = {name: sites.sites[name] for name in list(sites.sites.keys())[:20]}
notify = QueryNotifyPrint(verbose=True)
results = sherlock("testuser", test_sites, notify)
# Analyze response times
response_times = []
for site_name, result_data in results.items():
result = result_data['status']
if result.query_time:
response_times.append(result.query_time)
if response_times:
print(f"\nPerformance Analysis:")
print(f" Average response time: {statistics.mean(response_times):.3f}s")
print(f" Median response time: {statistics.median(response_times):.3f}s")
print(f" Fastest site: {min(response_times):.3f}s")
print(f" Slowest site: {max(response_times):.3f}s")Install with Tessl CLI
npx tessl i tessl/pypi-sherlock-project