Advanced search filters #1

Open · wants to merge 3 commits into main
67 changes: 63 additions & 4 deletions src/cli.py
@@ -29,7 +29,6 @@ def check_pos_int(val: int):
else:
raise ValueError


def get_arguments(argv=sys.argv):
"""
The cli front end for the scraper.
@@ -40,11 +39,11 @@ def get_arguments(argv=sys.argv):
Returns:
parser.parse_args() -- A struct with all required info to run the scraper
"""
parser = argparse.ArgumentParser(description="Scrape google for images")
parser = argparse.ArgumentParser(description="Scrape Google for images")
parser.add_argument("keyword",
help="the phrase used to find images",
type=str,
nargs=1)
nargs="?")
parser.add_argument("-c", "--count",
help="How many images to try to scrape",
type=check_pos_int,
@@ -59,9 +58,69 @@ def get_arguments(argv=sys.argv):
type=check_pos_int,
nargs="?",
default=1)
parser.add_argument("-s", "--size",
help="Restrict your search to a certain size of image.",
type=str,
nargs="?",
default=None,
choices=["large","medium","icon", "400x300", "640x480",
"800x600", "1024x768", "2mp", "4mp", "8mp",
"10mp", "12mp", "15mp", "20mp", "40mp", "70mp"])
parser.add_argument("-a", "--aspectratio",
help="Restrict to specific aspect ratios.",
type=str,
nargs="?",
default=None,
choices=["tall", "square", "wide", "panoramic"])
parser.add_argument("-i", "--color",
help="Search for a certain color of image.",
type=str,
nargs="?",
default=None,
choices=["color", "grayscale", "transparent", "red",
"orange", "yellow", "green", "teal", "blue",
"purple", "pink", "white", "gray", "black",
"brown"])
parser.add_argument("-k", "--type",
help="The type of image to search for.",
type=str,
nargs="?",
default=None,
choices=["face", "photo", "clipart",
"lineart", "animated"],
dest="type")
parser.add_argument("-r", "--region",
help="Get results from a specific country.",
type=str,
nargs="?",
default=None)
parser.add_argument("-w", "--site",
help="Get results from a specific site or domain.",
type=str,
nargs="?",
default=None)
parser.add_argument("-f", "--filetype",
help="Search for a specific file extension.",
type=str,
nargs="?",
default=None,
choices=["jpg", "gif", "png", "bmp",
"svg", "webp", "ico", "raw"])
parser.add_argument("-u", "--usage",
help="Specify usage rights.",
type=str,
nargs="?",
default=None,
choices=["cc", "other"])
parser.add_argument("-p", "--safesearch",
help="Specify safesearch usage. Can be 'on' or 'off'.",
type=str,
nargs="?",
default="off",
choices=["on", "off"])
args = parser.parse_args(argv[1:])
# Set default directory
if args.directory is None:
args.directory = get_default_dir(args.keyword[0])
args.directory = get_default_dir(args.keyword)
return args
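
As a quick sanity check of the new interface (illustrative only, not part of the diff; assumes src/cli.py is importable):

# Hypothetical REPL session; get_arguments skips argv[0] itself.
from cli import get_arguments

args = get_arguments(["prog", "sunsets", "-c", "10",
                      "-s", "large", "-i", "green", "-p", "on"])
print(args.keyword)     # "sunsets" -- a plain string now that nargs="?"
print(args.size)        # "large"
print(args.color)       # "green"
print(args.safesearch)  # "on"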
15 changes: 14 additions & 1 deletion src/main.py
@@ -5,7 +5,20 @@

def main():
args = get_arguments(sys.argv)
scrape_images(args.keyword[0], args.count, args.directory, args.threads)
filters = {
"size": args.size,
"aspectratio": args.aspectratio,
"color": args.color,
"type": args.type,
"region": args.region,
"site": args.site,
"filetype": args.filetype,
"usage": args.usage,
"safesearch": args.safesearch,
}
scrape_images(args.keyword, args.count, args.directory, args.threads, filters)

if __name__ == "__main__":
main()
130 changes: 127 additions & 3 deletions src/scraper.py
@@ -1,5 +1,6 @@
import json, os, sys
from concurrent.futures import ThreadPoolExecutor

import filetype
import requests
@@ -41,6 +42,115 @@ def add_filetype(file_path: str):
eprint(err)
return 1


def process_image_size(val: str):
    # This can be refactored to use maps and eliminate redundancy
    # (test first; see the sketch after this function)
key = 'isz:'
if (val == 'large'):
return key + 'l'
elif (val == 'medium'):
return key + 'm'
elif (val == 'icon'):
return key + 'i'
elif (val in ['400x300', '640x480', '800x600', '1024x768']):
key += 'lt%2Cislt:'
if (val == '400x300'):
return key + "qsvga"
elif (val == '640x480'):
return key + "vga"
elif (val == '800x600'):
return key + "svga"
elif (val == '1024x768'):
return key + "xga"
elif (val in ['2mp','4mp','6mp','8mp','10mp',
'12mp','15mp','20mp','40mp','70mp']):
return key + 'lt%2Cislt:' + val
else:
return ""
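
The comment above suggests a map-based refactor; a minimal sketch of that shape, untested and intended to stay behavior-identical to the chain it would replace:

# Sketch only: verify each mapping against the original chain before swapping it in.
_SIMPLE_SIZES = {'large': 'l', 'medium': 'm', 'icon': 'i'}
_EXACT_SIZES = {'400x300': 'qsvga', '640x480': 'vga',
                '800x600': 'svga', '1024x768': 'xga'}
_MEGAPIXEL_SIZES = {'2mp', '4mp', '6mp', '8mp', '10mp',
                    '12mp', '15mp', '20mp', '40mp', '70mp'}

def process_image_size(val: str):
    key = 'isz:'
    if val in _SIMPLE_SIZES:
        return key + _SIMPLE_SIZES[val]
    if val in _EXACT_SIZES:
        return key + 'lt%2Cislt:' + _EXACT_SIZES[val]
    if val in _MEGAPIXEL_SIZES:
        return key + 'lt%2Cislt:' + val
    return ''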

def process_image_aspectratio(val: str):
key = 'iar:'
if (val == 'tall'):
return key + 't'
elif (val == 'square'):
return key + 's'
elif (val == 'wide'):
return key + 'w'
    elif (val == 'panoramic'):
        return key + 'xw'
    else:
        return ""

def process_image_color(val: str):
if (val == "color"):
return "ic:color"
elif (val == "grayscale"):
return "ic:gray"
elif (val == "transparent"):
return "ic:trans"
elif (val in ['red','orange','yellow','green','teal','blue',
'purple','pink','white','gray','black','brown']):
return "ic:specific%2Cisc:" + val
else:
return ""

def process_image_type(val: str):
if (val in ['face', 'photo', 'clipart', 'lineart', 'animated']):
return 'itp:' + val
else:
return ""

def process_image_region(val: str):
    if val is not None:
        return 'ctr:country' + val.upper()
    return ""

def process_image_filetype(val: str):
    if val in ['jpg', 'gif', 'png', 'bmp', 'svg', 'webp', 'ico', 'raw']:
        return 'ift:' + val
    return ""

def process_image_usage(val: str):
key = 'sur:'
if (val == 'cc'):
return key + 'cl'
elif (val == 'other'):
return key + 'ol'
else:
return ''


def setup_url(filters):
    # Strings are immutable, so the concatenation below never mutates the base URL.
    filtered_url = search_url
    url_ids = []

filter_keys = list(filters.keys())
# Need to confirm we can't put these into the tbs tag
if filters["safesearch"] == "on":
filtered_url += "&safe=on"
if filters["site"] != None:
filtered_url += ("&as_sitesearch=" + filters["site"])
filter_keys.remove("safesearch")
filter_keys.remove("site")
# if filters["region"] != None:
# filtered_url += ("&" + process_image_region(filters["region"]))
# filter_keys.remove("region")

    def append_val(l, v):
        # Skip filters that are unset (None) or unrecognized ("").
        if v:
            l.append(v)
for k in filter_keys:
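        # Dynamic dispatch: find the matching process_image_<filter> helper in module globals.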
function_name = "process_image_" + k
process_function = globals()[function_name]
append_val(url_ids, process_function(filters[k]))

    if url_ids:
        filtered_url += "&tbs=" + ",".join(url_ids)

return filtered_url
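
A hypothetical spot check of the assembled URL (assumes the module-level search_url base; the expected suffix follows from the process_image_* helpers above):

# Only size, aspectratio, filetype, and safesearch are set; unset filters are skipped.
filters = {
    "size": "large", "aspectratio": "wide", "color": None,
    "type": None, "region": None, "site": None,
    "filetype": "png", "usage": None, "safesearch": "on",
}
url = setup_url(filters)
# url should end with: &safe=on&tbs=isz:l,iar:w,ift:png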


############################# scraping helpers ################################

def get_image_urls(query: str, page: int):
@@ -159,7 +269,7 @@ def get_manifest(search_key: str, image_cnt: int):

################################# main api ####################################

def scrape_images(search_key, image_cnt, directory, threads):
def scrape_images(search_key, image_cnt, directory, threads, filters):
"""
Request manifest, generate paths, save files, get filetype.
This is the only function that should be called externally.
@@ -169,12 +279,13 @@ def scrape_images(search_key, image_cnt, directory, threads):
image_cnt -- how many images are we trying to scrape
directory -- the folder to save scraped images in
threads -- how many worker threads to spawn
    filters -- dict of image filters to apply to the search results
"""
if DEBUG:
print("savedir: {}".format(directory))
if not os.path.exists(directory):
os.makedirs(directory)

    # Rebind the module-level search_url so the manifest helpers below see the
    # filtered URL; a plain local assignment here would go unused.
    global search_url
    search_url = setup_url(filters)
id_url_manifest = get_manifest(search_key, image_cnt)
with ThreadPoolExecutor(max_workers=threads) as pool:
with tqdm(total=len(id_url_manifest)) as progress:
@@ -202,7 +313,20 @@ def test():
directory = get_default_dir(search_key)
threads = 4

scrape_images(search_key, image_cnt, directory, threads)
filters = {
"size": "large",
"aspectratio": "panoramic",
"color": "green",
"type": "clipart",
"region": "CA", # Needs a mapping of inputs to country choices
# Use an import statement and roll in another file
"site": "laksjdf",
"filetype": "png",
"usage": "other",
"safesearch": "on"
}

scrape_images(search_key, image_cnt, directory, threads, filters)
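
For the region TODO in the filters above, one possible shape for the mapping (hypothetical module name and entries, shown only to illustrate the idea):

# countries.py (hypothetical): map friendly names to ISO 3166-1 alpha-2 codes.
COUNTRY_CODES = {
    "canada": "CA",
    "germany": "DE",
    "japan": "JP",
}

def normalize_region(user_input: str) -> str:
    # Fall back to upper-casing the input if it already looks like a code.
    return COUNTRY_CODES.get(user_input.lower(), user_input.upper())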


if __name__ == "__main__":