fix up options and write simplified json

This commit is contained in:
Jules Laplace
2015-07-11 00:41:20 -04:00
parent 5de59e3e60
commit c4d9ffa024
2 changed files with 86 additions and 25 deletions

View File

@@ -3,7 +3,8 @@ ytmnd
ytmnd scraper. ytmnd scraper.
`python ./ytmnd.py [--media-only] [--no-web-audio] [--json] [-u username] [domain]` `./ytmnd.py -u _username_`
`./ytmnd.py domain`
serving serving
------- -------
@@ -18,8 +19,10 @@ options
| flag | description | | flag | description |
| -------------- | ----------------------- | | -------------- | ----------------------- |
| `--media-only` | only download the gif and mp3 | | `--media-only` | only download the gif and mp3 |
| `--html-only` | only write an html file |
| `--json-only` | writes simplified json to a file |
| `--no-web-audio` | uses the <audio> tag instead of web audio | | `--no-web-audio` | uses the <audio> tag instead of web audio |
| `--json` | dumps json for the ytmnd to stdout | | `--print-json` | dumps raw json from ytmnd to stdout |
| `--user` (or `-u`) | fetch all ytmnds for a user | | `--user` (or `-u`) | fetch all ytmnds for a user |
license license

104
ytmnd.py
View File

@@ -4,6 +4,7 @@ import sys
import os import os
import os.path import os.path
import re import re
import time
import urllib2 import urllib2
import simplejson import simplejson
from optparse import OptionParser from optparse import OptionParser
@@ -11,9 +12,13 @@ from optparse import OptionParser
class YTMND: class YTMND:
def __init__ (self): def __init__ (self):
self.user_mode = False
self.media_only = False self.media_only = False
self.html_only = False
self.json_only = False
self.no_web_audio = False self.no_web_audio = False
self.json = False self.print_json = False
self.sleep = 5
# Scrapes sites from the profile page, then fetches them # Scrapes sites from the profile page, then fetches them
def fetch_user(self, user): def fetch_user(self, user):
@@ -28,19 +33,25 @@ class YTMND:
for line in ytmnd_html: for line in ytmnd_html:
if 'profile_link' in line: if 'profile_link' in line:
expr = r"site_link\" href=\"http://(\S+).ytmn(d|sfw)?.com\"" expr = r"site_link\" href=\"http://(\S+).ytmn(d|sfw)?.com\""
domain = re.search(expr,line).group(1) domain = re.search(expr,line).group(1)
domains.append(domain) domains.append(domain)
print ">> found %d domains" % len( domains ) if self.json_only:
os.system("mkdir -p %s" % user) parsed = []
os.chdir(user) for domain in domains:
if not self.no_web_audio: parsed.append( self.fetch_ytmnd( domain ) )
self.copy_ytmnd_js() self.write_json(ytmnd_name, parsed)
for domain in domains:
ytmnd.fetch_ytmnd( domain ) else:
os.chdir("..") print ">> found %d domains" % len( domains )
os.system("mkdir -p %s" % user)
os.chdir(user)
if not self.no_web_audio:
self.copy_ytmnd_js()
for domain in domains:
self.fetch_ytmnd( domain )
os.chdir("..")
# Fetches a single subdomain # Fetches a single subdomain
def fetch_ytmnd(self, domain): def fetch_ytmnd(self, domain):
@@ -49,7 +60,10 @@ class YTMND:
print("expecting one ytmnd name, got "+str(sys.argv)) print("expecting one ytmnd name, got "+str(sys.argv))
return return
print "fetching %s" % domain if not self.print_json:
print "fetching %s" % domain
if not self.sleep:
time.sleep(self.sleep)
ytmnd_name = domain ytmnd_name = domain
ytmnd_html = urllib2.urlopen("http://" + domain + ".ytmnd.com").read() ytmnd_html = urllib2.urlopen("http://" + domain + ".ytmnd.com").read()
@@ -57,13 +71,19 @@ class YTMND:
ytmnd_id = re.search(expr,ytmnd_html).group(1) ytmnd_id = re.search(expr,ytmnd_html).group(1)
ytmnd_info = simplejson.load(urllib2.urlopen("http://" + domain + ".ytmnd.com/info/" + ytmnd_id + "/json")) ytmnd_info = simplejson.load(urllib2.urlopen("http://" + domain + ".ytmnd.com/info/" + ytmnd_id + "/json"))
if ytmnd.json: if self.print_json:
print simplejson.dumps(ytmnd_info, sort_keys=True, indent=4 * ' ') print simplejson.dumps(ytmnd_info, sort_keys=True, indent=4 * ' ')
# ytmnd.write_json(ytmnd_info) elif self.json_only:
return self.parse_json(ytmnd_info)
elif self.media_only:
self.fetch_media(ytmnd_info)
elif self.html_only:
self.write_index(ytmnd_info)
else: else:
ytmnd.fetch_media(ytmnd_info) self.fetch_media(ytmnd_info)
if not ytmnd.media_only: self.write_index(ytmnd_info)
ytmnd.write_index(ytmnd_info)
return ytmnd_info
# Fetches the gif and mp3 for a post # Fetches the gif and mp3 for a post
def fetch_media(self, ytmnd_info): def fetch_media(self, ytmnd_info):
@@ -126,7 +146,7 @@ class YTMND:
self.write_zoom_text(fn, ytmnd_info) self.write_zoom_text(fn, ytmnd_info)
if self.no_web_audio: if self.no_web_audio:
fn.write("<audio src=%s.mp3 loop autoplay>\n" % domain) fn.write("<audio src='%s.%s' loop autoplay>\n" % (domain, wav_type))
fn.write("</body>\n") fn.write("</body>\n")
else: else:
fn.write("</body>\n") fn.write("</body>\n")
@@ -180,12 +200,43 @@ class YTMND:
if not os.path.isfile("ytmnd.js"): if not os.path.isfile("ytmnd.js"):
os.system("cp ../ytmnd.js .") os.system("cp ../ytmnd.js .")
# Writes site JSON to a file # Parses data we need out of JSON
def write_json (self, ytmnd_info): def parse_json (self, ytmnd_info):
domain = ytmnd_info['site']['domain'] domain = ytmnd_info['site']['domain']
bgcolor = ytmnd_info['site']['background']['color']
title = ytmnd_info['site']['description']
placement = ytmnd_info['site']['foreground']['placement']
gif_type = ytmnd_info['site']['foreground']['url'].split(".")[-1]
wav_type = ytmnd_info['site']['sound']['type']
zoom_text = ytmnd_info['site']['zoom_text']
if len(zoom_text['line_1']) == 0:
zoom_text = ""
if 'alternates' in ytmnd_info['site']['sound']:
key = ytmnd_info['site']['sound']['alternates'].keys()[0]
value = ytmnd_info['site']['sound']['alternates'][key]
if value['file_type'] != 'swf':
wav_type = ytmnd_info['site']['sound']['file_type']
simplified_info = {
'domain': domain,
'bgcolor': bgcolor,
'title': title,
'placement': placement,
'gif': domain + "." + gif_type,
'wav': domain + "." + wav_type,
'gif_type': gif_type,
'wav_type': wav_type,
'zoom_text': zoom_text,
}
return simplified_info
# Writes site JSON to a file
def write_json (self, domain, data):
fn = open(domain + '.json', 'w') fn = open(domain + '.json', 'w')
fn.write( simplejson.dumps(ytmnd_info) ) fn.write( simplejson.dumps(data) )
fn.close() fn.close()
if __name__ == '__main__': if __name__ == '__main__':
@@ -194,19 +245,26 @@ if __name__ == '__main__':
parser.add_option("-u", "--user", action="store_true") parser.add_option("-u", "--user", action="store_true")
parser.add_option("-m", "--media-only", action="store_true") parser.add_option("-m", "--media-only", action="store_true")
parser.add_option("-f", "--html-only", action="store_true")
parser.add_option("-j", "--json-only", action="store_true")
parser.add_option("-w", "--no-web-audio", action="store_true") parser.add_option("-w", "--no-web-audio", action="store_true")
parser.add_option("-j", "--json", action="store_true") parser.add_option("-p", "--print-json", action="store_true")
parser.add_option("-s", "--sleep", action="store", type="int", dest="sleep", default=5)
(options, args) = parser.parse_args() (options, args) = parser.parse_args()
if len(args) == 0: if len(args) == 0:
print "usage: ./ytmnd.py [-u username] [--media-only] [--no-web-audio] [--json] [domain]" parser.error("incorrect number of arguments")
sys.exit(1) sys.exit(1)
ytmnd = YTMND () ytmnd = YTMND ()
ytmnd.user_mode = options.user
ytmnd.media_only = options.media_only ytmnd.media_only = options.media_only
ytmnd.html_only = options.html_only
ytmnd.json_only = options.json_only
ytmnd.no_web_audio = options.no_web_audio ytmnd.no_web_audio = options.no_web_audio
ytmnd.json = options.json ytmnd.print_json = options.print_json
ytmnd.sleep = options.sleep
if options.user: if options.user:
user = args[0] user = args[0]