#!/usr/bin/env python
# coding: utf-8

# BA 4chan-thread-archiver
#
# Built for the Bibliotheca Anonoma by Lawrence Wu, 2012/04/04
# Rewritten from scratch to use the py4chan API wrapper, save images in a separate folder, download plain HTML, and add modularization, comments, and code cleanup
# Formerly based on https://github.com/socketubs/4chandownloader

#
# Initial release Nov. 5, 2009
# v6 release Jan. 20, 2009
# http://cal.freeshell.org
#
# Refactor, update and Python package
# by Socketubs (http://socketubs.net/)
# 09-08-12
#

import os
import time
import json
import re
import errno
import requests

from docopt import docopt

# Command-line interface definition parsed by docopt. Every usage line must
# start with the same program name, and each option that appears in the usage
# patterns is described under "Options".
doc = """4chan-thread-archiver-old, uses 4chan API (without py4chan wrapper)
to download thread images and/or thumbnails, along with thread HTML, JSON,
and a list of referenced external links.

Usage:
  4chan-thread-archiver-orig <url> [--path=<string>] [--delay=<int>] [--nothumbs] [--thumbsonly]
  4chan-thread-archiver-orig -h | --help
  4chan-thread-archiver-orig -v | --version

Options:
  --path=<string>     Folder to save threads in (a "4chan" folder in the current directory if omitted)
  --nothumbs          Don't download thumbnails
  --thumbsonly        Download thumbnails, no images
  --delay=<int>       Delay between thread checks [default: 20]
  -h --help           Show help
  -v --version        Show version
"""

"""=== Constant Variables and Domain Names to Use ==="""

""" Important Message Front Tag """
_TAG = " :: "

""" HTTP header: use SSL by default """
HTTP_HEADER = 'https://'
#HTTP_HEADER = #'http://'

""" 4chan top level domain names """
FOURCHAN_BOARDS = 'boards.4chan.org'
FOURCHAN_CDN = '4cdn.org'

"""4chan Content Delivery Network domain names (for images, thumbs, api)"""
FOURCHAN_API = 'a.' + FOURCHAN_CDN
FOURCHAN_IMAGES = 'i.' + FOURCHAN_CDN
FOURCHAN_THUMBS = 't.' + FOURCHAN_CDN
FOURCHAN_STATIC = 's.' + FOURCHAN_CDN

"""Original 4chan cdn links, changed 20131202. Commented here for compatibility purposes.
http://chrishateswriting.com/post/68794699432/small-things-add-up"""
#FOURCHAN_API = 'api.4chan.org'
#FOURCHAN_IMAGES = 'images.4chan.org'
#FOURCHAN_THUMBS = 'thumbs.4chan.org'
#FOURCHAN_STATIC = 'static.4chan.org'

"""Full HTTP Links to 4chan servers. Format is (boards, object_id)"""
FOURCHAN_BOARDS_URL = HTTP_HEADER + FOURCHAN_BOARDS + '/%s/res/%s'
FOURCHAN_API_URL = HTTP_HEADER + FOURCHAN_API + '/%s/res/%s.json'
FOURCHAN_IMAGES_URL = HTTP_HEADER + FOURCHAN_IMAGES + '/%s/src/%s'
FOURCHAN_THUMBS_URL = HTTP_HEADER + FOURCHAN_THUMBS + '/%s/thumb/%s'

""" default folder names for image and thumbnails """
_DEFAULT_FOLDER = "4chan"
_IMAGE_DIR_NAME = "images"
_THUMB_DIR_NAME = "thumbs"
_CSS_DIR_NAME = "css"

_DUMP_COMPLETE_STRING = "Dump complete. To resume dumping the same thread,\nrun this script again."

# Recursively create paths if they don't exist 
# replace with `os.makedirs(path,exist_ok=True)` in python3
def make_sure_path_exists(path):
    """Create directory `path` (and any missing parents) unless it exists.

    Args:
        path: directory path to create.

    Raises:
        OSError: if the directory cannot be created, or if `path` already
            exists but is not a directory.
    """
    try:
        os.makedirs(path)
    except OSError as exception:
        # EEXIST is only harmless when the thing that already exists is a
        # directory; a regular file in the way must not be silently accepted.
        if exception.errno != errno.EEXIST or not os.path.isdir(path):
            raise

# Download any file using requests
def download_file(fname, dst_folder, file_url):
    """Download `file_url` to `dst_folder/fname` unless that file exists.

    Args:
        fname: name of the file to create inside `dst_folder`.
        dst_folder: existing directory the file is saved in.
        file_url: URL to fetch with `requests.get`.

    Nothing is written when the server answers with an error status, so the
    download is attempted again on the next call instead of an error page
    being kept under the file's name.
    """
    # Destination of downloaded file
    file_dst = os.path.join(dst_folder, fname)

    if os.path.exists(file_dst):
        print('%s already downloaded' % fname)
        return

    print('%s downloading...' % fname)
    response = requests.get(file_url)
    if not response.ok:
        print(' | Failed, try later (%s)' % file_url)
        return

    # `response.content` is raw bytes, so the file must be opened in binary
    # mode; the context manager guarantees the handle is closed.
    with open(file_dst, 'wb') as out:
        out.write(response.content)

def dump_images(post, dst_dir, board):
    """Download the full-size image attached to `post`.

    Args:
        post: post dict from the thread JSON; its 'tim' and 'ext' values
            are combined to form the image file name.
        dst_dir: thread folder; the image is stored in its images
            sub-folder, which is created if missing.
        board: board name used to build the image URL.
    """
    # The constant is defined with a leading underscore; the unprefixed
    # name does not exist and raised NameError.
    dst_images_dir = os.path.join(dst_dir, _IMAGE_DIR_NAME)
    make_sure_path_exists(dst_images_dir)

    image_name = '%s%s' % (post['tim'], post['ext'])
    image_url = FOURCHAN_IMAGES_URL % (board, image_name)

    download_file(image_name, dst_images_dir, image_url)

# Dump the thumbnail of a single post from 4chan, using a `post` entry from the thread JSON
def dump_thumbs(post, dst_dir, board):
    """Download the thumbnail of the image attached to `post`.

    Args:
        post: post dict from the thread JSON; its 'tim' value followed by
            's.jpg' forms the thumbnail file name.
        dst_dir: thread folder; the thumbnail is stored in its thumbs
            sub-folder, which is created if missing.
        board: board name used to build the thumbnail URL.
    """
    # The constant is defined with a leading underscore; the unprefixed
    # name does not exist and raised NameError.
    dst_thumbs_dir = os.path.join(dst_dir, _THUMB_DIR_NAME)
    make_sure_path_exists(dst_thumbs_dir)

    thumb_name = '%ss.jpg' % post['tim']
    thumb_url = FOURCHAN_THUMBS_URL % (board, thumb_name)

    download_file(thumb_name, dst_thumbs_dir, thumb_url)

# File in place regex function originally scripted by steveha on StackOverflow: 
# http://stackoverflow.com/questions/1597649/replace-strings-in-files-by-python
# Notice: `\1` notation could be interpreted by python as `\x01`! Escape it with a second backslash: `\\1`  
def file_replace(fname, pat, s_after):
    """Replace every match of `pat` in the file `fname` with `s_after`.

    Args:
        fname: path of the text file to rewrite in place.
        pat: regular expression, as a pattern string or a compiled pattern.
        s_after: replacement passed to the regex `sub` (backreferences such
            as `\\1` are honoured).

    The file is left untouched when the pattern does not occur in it.
    Otherwise the new content is written to `fname + ".tmp"`, which then
    replaces the original file.
    """
    regex = re.compile(pat)

    with open(fname) as src:
        lines = src.readlines()

    # first, see if the pattern is even in the file.
    if not any(regex.search(line) for line in lines):
        return              # pattern does not occur in file so we are done.

    # pattern is in the file, so perform replace operation.
    out_fname = fname + ".tmp"
    with open(out_fname, "w") as out:
        out.writelines(regex.sub(s_after, line) for line in lines)

    # Both files are closed by now. os.replace (python3) overwrites the
    # destination on every platform; fall back to os.rename where missing.
    getattr(os, "replace", os.rename)(out_fname, fname)

def dump_css(dst_dir):
    """ Dumps the CSS from 4cdn into `dst_dir`.

    Each stylesheet is saved without its version number, e.g.
    `yotsubluenew.473.css` is stored as `yotsubluenew.css`.

    (FIXME) Currently uses a static list of links, which works but is not ideal. Eventually, we need to create a JSON HTML Templater system.

    """
    # Raw strings keep the regex escapes intact, and the host name is
    # escaped so its dots only match literal dots.
    fourchan_css_regex = re.compile(
        r"https?://" + re.escape(FOURCHAN_STATIC) + r"/css/(\w+)\.\d+\.css")

    # (FUTURE) Mod dump_css() to automatically scrape CSS links each time.
    css_files = [
        "yotsubluemobile.473.css",
        "yotsubluenew.473.css",
        "yotsubanew.473.css",
        "futabanew.473.css",
        "burichannew.473.css",
        "photon.473.css",
        "tomorrow.473.css",
    ]

    for css_file in css_files:
        css_url = HTTP_HEADER + FOURCHAN_STATIC + "/css/" + css_file
        css_name = fourchan_css_regex.sub(r"\1.css", css_url)
        download_file(css_name, dst_dir, css_url)


def dump_html(dst_dir, board, thread):
    """ Dumps thread in raw HTML format to `<thread-id>.html`

    After the download, links in the saved page are rewritten to point at
    the locally stored images, thumbnails and CSS files.

    Args:
        dst_dir: thread folder the HTML file and the css sub-folder go in.
        board: board name.
        thread: thread id.
    """
    # Raw strings keep the regex escapes intact; host names are escaped so
    # their dots only match literal dots.
    fourchan_images_regex = re.compile(
        r"https?://" + re.escape(FOURCHAN_IMAGES) + r"/\w+/src/")
    fourchan_thumbs_regex = re.compile(
        r"https?://\d+\." + re.escape(FOURCHAN_THUMBS) + r"/\w+/thumb/")
    fourchan_css_regex = re.compile(
        r"https?://" + re.escape(FOURCHAN_STATIC) + r"/css/(\w+)\.\d+\.css")

    html_filename = "%s.html" % thread
    html_url = FOURCHAN_BOARDS_URL % (board, thread)
    download_file(html_filename, dst_dir, html_url)

    # Download a local copy of all CSS files
    dst_css_dir = os.path.join(dst_dir, _CSS_DIR_NAME)
    make_sure_path_exists(dst_css_dir)
    dump_css(dst_css_dir)

    # download_file() writes nothing when the request fails; without the
    # file there is nothing to rewrite and opening it would raise.
    html_path = os.path.join(dst_dir, html_filename)
    if not os.path.exists(html_path):
        return

    # Convert all links in HTML dump to use locally downloaded files
    file_replace(html_path, '"//', '"http://')
    file_replace(html_path, fourchan_images_regex, _IMAGE_DIR_NAME + "/")
    file_replace(html_path, fourchan_thumbs_regex, _THUMB_DIR_NAME + "/")

    # convert HTML links to use local CSS files that we just downloaded
    # (FIXME) Might want to mod the HTML to use only ONE CSS file (perhaps by option)
    file_replace(html_path, fourchan_css_regex, r"css/\1.css")

# Grab thread JSON from 4chan API
def dump_json(dst_dir, board, thread):
    """Save the thread's JSON from the 4chan API to `<thread-id>.json`.

    The JSON is written pretty-printed (sorted keys, two-space indent).

    Args:
        dst_dir: thread folder the JSON file is written to.
        board: board name.
        thread: thread id.
    """
    json_filename = "%s.json" % thread
    json_path = os.path.join(dst_dir, json_filename)

    json_thread = requests.get(FOURCHAN_API_URL % (board, thread))
    # Decode before opening the output file, so a response that is not valid
    # JSON cannot truncate a dump saved by an earlier run.
    thread_data = json_thread.json()

    with open(json_path, 'w') as json_file:
        json.dump(thread_data, json_file, sort_keys=True, indent=2, separators=(',', ': '))
	  
# Get all external links quoted in comments
def list_external_links(json_thread, dst_dir):
    """Write every URL quoted in the thread's comments to a text file.

    The list is saved as `external_links.txt` inside `dst_dir`, one URL per
    line, and is overwritten on every call.

    Args:
        json_thread: response object whose `json()` method returns the
            thread data, a dict with a 'posts' list of post dicts.
        dst_dir: thread folder the link list is written to.
    """
    # The Ultimate URL Regex
    # http://stackoverflow.com/questions/520031/whats-the-cleanest-way-to-extract-urls-from-a-string-using-python
    # The backslash escapes for literal parentheses and brackets are required,
    # and the pattern must not contain invisible zero-width characters (a
    # known copy-paste artifact of that page).
    linkregex = re.compile(r"""((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""", re.DOTALL)

    # Fetch the posts before opening the file, so a failure here does not
    # wipe a list saved earlier.
    posts = json_thread.json()['posts']

    # File to store list of all external links quoted in comments (overwrite upon each loop iteration)
    linklist_dst = os.path.join(dst_dir, "external_links.txt")

    with open(linklist_dst, "w") as linklist_file:
        for post in posts:
            # comments are optional in 4chan API
            if "com" not in post:
                continue

            # We need to get rid of all <wbr> tags before parsing
            cleaned_com = post['com'].replace('<wbr>', '')

            # findall yields one tuple of groups per match; group 0 of the
            # tuple is the whole URL. No match means nothing to write.
            for item in linkregex.findall(cleaned_com):
                print("Found URL, saving in %s:\n%s\n" % (linklist_dst, item[0]))
                linklist_file.write(item[0])
                linklist_file.write('\n')   # subdivide with newlines

# Download images, thumbs, and gather links to external urls
def get_files(dst_dir, board, thread, nothumbs, thumbsonly):
    """Download a thread's images and thumbnails, then list external links.

    Args:
        dst_dir: thread folder everything is saved under.
        board: board name.
        thread: thread id.
        nothumbs: when true, thumbnails are skipped (unless `thumbsonly`).
        thumbsonly: when true, only thumbnails are downloaded, no images.
    """
    # Grab thread JSON from 4chan API
    json_thread = requests.get(FOURCHAN_API_URL % (board, thread))

    # `json` is a method on the response (dump_json calls it the same way);
    # subscripting the bound method itself raises TypeError.
    for post in json_thread.json()['posts']:
        # Skip posts without an attachment, or whose file was deleted
        if not post.get('filename') or post.get('filedeleted'):
            continue

        # Download images, if not only downloading thumbnails
        if not thumbsonly:
            dump_images(post, dst_dir, board)

        # Download thumbnails by default, but not if user doesn't want it
        if thumbsonly or not nothumbs:
            dump_thumbs(post, dst_dir, board)

    # Get all external links quoted in comments
    list_external_links(json_thread, dst_dir)

def main(args):
    """Archive one thread repeatedly until interrupted with Ctrl-C.

    Args:
        args: docopt result dict with the keys '<url>', '--path',
            '--nothumbs', '--thumbsonly' and '--delay'.

    Raises:
        SystemExit: with status 0 on Ctrl-C, or with an error message when
            the thread URL cannot be split into board and thread id.
    """
    # Copy data from arguments. A thread URL splits on '/' as
    # ['https:', '', '<host>', '<board>', 'res', '<thread>'].
    url = args.get('<url>')
    url_parts = url.split('/')
    if len(url_parts) < 6:
        raise SystemExit("Unrecognized thread URL: %s" % url)
    board = url_parts[3]
    # Drop a trailing '#p<post>' fragment so it does not end up in file
    # names and API URLs.
    thread = url_parts[5].split('#')[0]

    path = args.get('--path')
    nothumbs = args.get('--nothumbs', False)
    thumbsonly = args.get('--thumbsonly', False)
    # Parse the delay once, so a bad value fails before any download starts
    delay = int(args.get('--delay'))

    # Set a default path if none is given
    if path is None:
        path = os.path.join(os.getcwd(), _DEFAULT_FOLDER)

    dst_dir = os.path.join(path, board, thread)

    # try/except loop to handle Ctrl-C
    try:
        # Begin file dump loop; it only ends through an exception
        while True:
            print(' :: Board: %s' % board)
            print(' :: Thread: %s' % thread)

            # Create paths if they don't exist
            make_sure_path_exists(dst_dir)

            # Download images, thumbs, and gather links to external urls
            get_files(dst_dir, board, thread, nothumbs, thumbsonly)

            # Dumps thread in raw HTML format to `<thread-id>.html`
            dump_html(dst_dir, board, thread)

            # Dumps thread in JSON format to `<thread-id>.json` file, pretty printed
            dump_json(dst_dir, board, thread)

            # Wait to execute code again
            print("Waiting %s seconds before retrying (Type Ctrl-C to stop)\n" % delay)
            time.sleep(delay)

    except KeyboardInterrupt:
        print("\n")
        print(_DUMP_COMPLETE_STRING)
        raise SystemExit(0)

if __name__ == '__main__':
    # Parse the command line against the usage text, then hand the
    # resulting options straight to main().
    main(docopt(doc, version=0.3))
