From 758ca1de9f69b474b24dd4ebaa2a49dce7578be9 Mon Sep 17 00:00:00 2001 From: surajpai Date: Fri, 20 Oct 2017 13:14:57 +0530 Subject: [PATCH] Add Automated Google Image Downloader #35 --- README.rst | 7 +- hackr/__init__.py | 1 + hackr/image_downloader.py | 169 ++++++++++++++++++++++++++++++++++++++ requirements.txt | 1 - 4 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 hackr/image_downloader.py diff --git a/README.rst b/README.rst index 650a016..f481429 100644 --- a/README.rst +++ b/README.rst @@ -20,6 +20,7 @@ hackr currently supports the following features: - Send web requests (GET, POST, PUT) - Parse data in JSON - Send Emails +- Automatically Download Images for Keyword from Google Image Search hackr officially supports Python 2.7 currently. @@ -80,7 +81,11 @@ Send Emails >>> #Enter your GMAIL email address and password in the parameters email, and password. >>> hackr.actions.email("Hey there", email="foo@bar.com", password="foobar", to="bar@foo.com", subject="Foo Bar") - +Download Images from Google Search + >>> import hackr + >>> hackr.image_downloader.get_google_images(search_keyword=["foo"], keywords=["high resolution"]) + + Contribution Guidelines ----------------------- diff --git a/hackr/__init__.py b/hackr/__init__.py index 64dbaa4..b0328e2 100644 --- a/hackr/__init__.py +++ b/hackr/__init__.py @@ -7,3 +7,4 @@ from . import web from . import actions from . import image +from . 
import image_downloader \ No newline at end of file diff --git a/hackr/image_downloader.py b/hackr/image_downloader.py new file mode 100644 index 0000000..711899d --- /dev/null +++ b/hackr/image_downloader.py @@ -0,0 +1,169 @@ + + +# coding: utf-8 + +# In[ ]: + +#Searching and Downloading Google Images/Image Links + +#Import Libraries + +#coding: UTF-8 + +import time #Importing the time library to check the time of code execution +import sys #Importing the System Library +import os +import urllib2 + + +def get_google_images(search_keyword=["Test"], keywords=["high resolution"]): + t0 = time.time() #start the timer + + #Download Image Links + i= 0 + while i" + " Item name = " + str(search_keyword[i]) + print (iteration) + print ("Evaluating...") + search_keywords = search_keyword[i] + search = search_keywords.replace(' ','%20') + + #make a search keyword directory + try: + os.makedirs(search_keywords) + except OSError, e: + if e.errno != 17: + raise + # time.sleep might help here + pass + + j = 0 + while j "+str(k+1)) + + k=k+1; + + except IOError: #If there is any IOError + + errorCount+=1 + print("IOError on image "+str(k+1)) + k=k+1; + + except HTTPError as e: #If there is any HTTPError + + errorCount+=1 + print("HTTPError"+str(k)) + k=k+1; + except URLError as e: + + errorCount+=1 + print("URLError "+str(k)) + k=k+1; + + i = i+1 + + print("\n") + print("Everything downloaded!") + print("\n"+str(errorCount)+" ----> total Errors") + + +#Downloading entire Web Document (Raw Page Content) +def download_page(url): + version = (3,0) + cur_version = sys.version_info + if cur_version >= version: #If the Current Version of Python is 3.0 or above + import urllib.request #urllib library for Extracting web pages + try: + headers = {} + headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36" + req = urllib.request.Request(url, headers = headers) + resp = urllib.request.urlopen(req) + respData = 
str(resp.read()) + return respData + except Exception as e: + print(str(e)) + else: #If the Current Version of Python is 2.x + import urllib2 + try: + headers = {} + headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17" + req = urllib2.Request(url, headers = headers) + response = urllib2.urlopen(req) + page = response.read() + return page + except: + return"Page Not found" + + +#Finding 'Next Image' from the given raw page +def _images_get_next_item(s): + start_line = s.find('rg_di') + if start_line == -1: #If no links are found then give an error! + end_quote = 0 + link = "no_links" + return link, end_quote + else: + start_line = s.find('"class="rg_meta"') + start_content = s.find('"ou"',start_line+1) + end_content = s.find(',"ow"',start_content+1) + content_raw = str(s[start_content+6:end_content-1]) + return content_raw, end_content + + +#Getting all links with the help of '_images_get_next_item' +def _images_get_all_items(page): + items = [] + while True: + item, end_content = _images_get_next_item(page) + if item == "no_links": + break + else: + items.append(item) #Append all the links in the list named 'Links' + # time.sleep(0.1) #Timer could be used to slow down the request for image downloads + page = page[end_content:] + return items + + + +if __name__ == "__main__": + get_google_images(["Ball"], ["high resolution"]) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 94bdd34..8c8b7e4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,3 @@ pillow xmljson sphinx_rtd_theme matplotlib==2.0.2 -pytest