#!/usr/local/bin/python
"""
ripplr.py
Purpose: A tumblr blog image scraper
Author: Kendrick Ledet
Date: 7/10/12
"""
"""
Copyright 2012 Kendrick Ledet
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import urllib2
import urllib
import json
import os
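# NOTE: this is a Python 2 script throughout (urllib2, raw_input, and
# print statements); run it with a Python 2 interpreter.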
""" Create a directory to store downloaded images in """
def create_path(path):
    if not os.path.isdir(path):
        os.makedirs(path)
""" Generate a clean blog url to use as folder name """
def clean_title(blog_url):
    return blog_url.split('/')[2]
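# e.g. clean_title('http://example.tumblr.com') returns 'example.tumblr.com'
# (a hypothetical blog URL, shown only to illustrate the split on '/')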
""" Download a given img link """
def download(img):
    img_url_chunks = img.split('/')
    filename = img_url_chunks[3] if len(img_url_chunks) < 5 else img_url_chunks[4]  # get the original filename (extension included) from the image link
    if os.path.exists(os.path.join(path, filename)):  # don't overwrite existing images
        print 'Already downloaded {} ...skipping'.format(filename)
    else:
        print 'Downloading {}'.format(filename)
        urllib.urlretrieve(img, os.path.join(path, filename))  # download the img to the same path checked above
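# Worked example (hypothetical URL, for illustration only):
# 'http://25.media.tumblr.com/tumblr_abc123_1280.jpg'.split('/') gives
# ['http:', '', '25.media.tumblr.com', 'tumblr_abc123_1280.jpg'], so the
# filename is chunk 3; image links with an extra path segment use chunk 4.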
""" Get some initial input from the user """
blog_url = raw_input('Enter tumblr url\n> ')
tag_query = raw_input('Enter a specific tag to filter posts by (just press enter for no tag filter)\n> ')
limit = int(raw_input('Enter max # of images to download (handy for grabbing, say, just the latest 50 posts; enter 0 to download all images)\n> '))
blog_url = blog_url.rstrip('/')  # if the URL has a trailing slash, remove it
""" Utilize user input to set and create download path """
title = clean_title(blog_url)
path = os.path.join('downloads', title)
create_path(path)
start = 0 # initialize the API reading start offset to 0
download_count = 0 # initialize download count at 0
""" Begin the download loop """
print 'Downloading from %s...' % (blog_url,)
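# Notes on the legacy v1 read API used below: 'start' and 'num' page through
# posts (50 per request here), 'tagged' filters posts by tag, and 'debug=1'
# makes the endpoint return a bare JSON object rather than its usual
# JavaScript-wrapped response (inferred from the json.loads() call below).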
while True:
    # Construct the full URL to retrieve JSON data from
    full_url = '%s/api/read/json?type=photo&debug=1&start=%d&num=50&tagged=%s' % (blog_url, start, urllib.quote(tag_query))  # quote the tag so spaces survive the query string
    request = urllib2.Request(full_url)  # make a Request object from it
    # Send request, get JSON response
    try:
        response = urllib2.urlopen(request)
        page = response.read()
        response.close()
    except urllib2.HTTPError as e:
        print 'HTTP Error', e.code, '(please make sure you have entered a valid tumblr URL)'
        exit(1)
    json_output = json.loads(page)  # parse the JSON data as a Python dict
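    # Response shape, as used below: json_output['posts'] is the list of post
    # dicts for this page, and json_output['posts-total'] is the blog-wide
    # photo post count (the int() cast further down suggests the API returns
    # it as a string).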
    if start == 0:  # if running on first loop iteration
        print 'This tumblog has', json_output['posts-total'], 'photo posts in total'  # tell the user how many total posts tumblr has
        if limit:
            print 'but we\'re going to download a maximum of', str(limit), 'images'
        else:
            print 'Downloading all images'
    # Download images in the current JSON response range
    for post in json_output['posts']:
        if post.get('photos'):  # post is a photoset, so download each image in it
            # (the post-level 'photo-url-1280' duplicates the photoset's first
            # image, so it is handled here only, to avoid counting it twice)
            for photo in post['photos']:
                download(photo['photo-url-1280'])
                download_count += 1
                if limit and download_count == limit:
                    print 'Limit reached, images are in ./downloads/'+title+'/'
                    exit(0)
        elif post.get('photo-url-1280'):  # single-photo post with image content
            download(post['photo-url-1280'])
            download_count += 1
            if limit and download_count == limit:
                print 'Limit reached, images are in ./downloads/'+title+'/'
                exit(0)
    start += 50  # advance the start offset by 50 for the next loop iteration
    if start >= int(json_output['posts-total']):  # if no more posts to scrape
        print 'Scraped', json_output['posts-total'], 'photo posts'
        print 'Downloads are contained in ./downloads/'+title+'/'
        break