#!/usr/bin/env python
# coding: utf-8

# BA 4chan-thread-archiver
#
# Built for the Bibliotheca Anonoma by Lawrence Wu, 2012/04/04
# Rewritten from scratch to use py4chan API wrapper, save in separate images folder, download plain HTML, modularization, comments, and code cleanup
# Formerly based on https://github.com/socketubs/4chandownloader

#
# Initial release Nov. 5, 2009
# v6 release Jan. 20, 2009
# http://cal.freeshell.org
#
# Refactor, update and Python package
# by Socketubs (http://socketubs.net/)
# 09-08-12
#

import os
import time
import json
import re
import errno
import requests0 as requests
import py4chan



"""=== Docopt Arguments and Documentation ==="""

from docopt import docopt

# Usage text parsed by docopt. Every option that appears in the usage pattern
# is also described under "Options" so that it is explained by `--help`.
doc = """BA-4chan-thread-archiver. Uses the 4chan API (with the py4chan wrapper)
to download thread images and/or thumbnails, along with thread HTML, JSON,
and a list of referenced external links.

Usage:
  4chan-thread-archiver <url> [--path=<string>] [--delay=<int>] [--nothumbs] [--thumbsonly]
  4chan-thread-archiver -h | --help
  4chan-thread-archiver -v | --version

Options:
  --path=<string>     Folder to save the archive in (defaults to ./4chan)
  --nothumbs          Don't download thumbnails
  --thumbsonly        Download thumbnails, no images
  --delay=<int>       Seconds to wait between thread checks [default: 20]
  -h --help           Show help
  -v --version        Show version
"""



"""=== Constant Variables and Domain Names to Use ==="""

""" Important Message Front Tag """
_TAG = " :: "

# URL scheme prefix: SSL is used by default
# (plain HTTP alternative: 'http://')
HTTP_HEADER = 'https://'

# 4chan top level domain names
FOURCHAN_BOARDS = 'boards.4chan.org'
FOURCHAN_CDN = '4cdn.org'

# 4chan Content Delivery Network hosts (api, images, thumbs, static files):
# each one is a single-letter sub-domain of FOURCHAN_CDN
FOURCHAN_API = 'a.%s' % FOURCHAN_CDN
FOURCHAN_IMAGES = 'i.%s' % FOURCHAN_CDN
FOURCHAN_THUMBS = 't.%s' % FOURCHAN_CDN
FOURCHAN_STATIC = 's.%s' % FOURCHAN_CDN

# Original 4chan cdn links, changed 20131202. Kept here (commented out) for
# compatibility purposes.
# http://chrishateswriting.com/post/68794699432/small-things-add-up
#FOURCHAN_API = 'api.4chan.org'
#FOURCHAN_IMAGES = 'images.4chan.org'
#FOURCHAN_THUMBS = 'thumbs.4chan.org'
#FOURCHAN_STATIC = 'static.4chan.org'

# Full URL templates for the 4chan servers, to be filled in with
# `% (board, object_id)`
FOURCHAN_BOARDS_URL = ''.join([HTTP_HEADER, FOURCHAN_BOARDS, '/%s/res/%s'])
FOURCHAN_API_URL = ''.join([HTTP_HEADER, FOURCHAN_API, '/%s/res/%s.json'])
FOURCHAN_IMAGES_URL = ''.join([HTTP_HEADER, FOURCHAN_IMAGES, '/%s/src/%s'])
FOURCHAN_THUMBS_URL = ''.join([HTTP_HEADER, FOURCHAN_THUMBS, '/%s/thumb/%s'])

# Default folder names: archive root, images, thumbnails and stylesheets
_DEFAULT_FOLDER = "4chan"
_IMAGE_DIR_NAME = "images"
_THUMB_DIR_NAME = "thumbs"
_CSS_DIR_NAME = "css"



"""=== Functions ==="""

def make_sure_path_exists(path):
    """ Recursively create a folder path if it doesn't exist yet

    An already existing directory is not an error. If `path` exists but is
    not a directory (for example a regular file), the `OSError` raised by
    `os.makedirs` is re-raised instead of being silently ignored; any other
    `OSError` is re-raised as well.

    (replace) with `os.makedirs(path,exist_ok=True)` in python3

    """
    try:
        os.makedirs(path)
    except OSError as exception:
        # EEXIST is only harmless when the existing entry really is a folder
        if exception.errno != errno.EEXIST or not os.path.isdir(path):
            raise



def download_file(fname, dst_folder, file_url):
    """ Download `file_url` to `dst_folder/fname`, unless that file already exists

    fname      -- name of the file to create inside `dst_folder`
    dst_folder -- existing folder to save into
    file_url   -- URL to fetch with `requests.get`

    Nothing is written when the server answers with anything other than
    HTTP 200, so an error page is never stored under the target name and the
    download is attempted again on the next call. Returns None.

    """
    # Destination of downloaded file
    file_dst = os.path.join(dst_folder, fname)

    # Files that are already on disk are never fetched again
    if os.path.exists(file_dst):
        print('%s already downloaded' % fname)
        return

    print('%s downloading...' % fname)
    response = requests.get(file_url)
    if response.status_code != 200:
        print(' | Failed, try later (%s)' % file_url)
        return

    # `content` is the raw body: write it in binary mode and close the handle
    with open(file_dst, 'wb') as out:
        out.write(response.content)



def file_replace(fname, pat, s_after):
    r""" Regex search-and-replace on a file, in place

    originally scripted by steveha on StackOverflow: http://stackoverflow.com/questions/1597649/replace-strings-in-files-by-python

    fname   -- path of the text file to modify
    pat     -- regex (pattern string or compiled pattern) to search for
    s_after -- replacement, as accepted by `re.sub`

    The file is left untouched when `pat` does not occur in it. Otherwise the
    substituted text is written to `<fname>.tmp`, which then replaces `fname`.

    Notice: in a normal (non-raw) string literal `\1` is read by python as
    `\x01`! Escape it with a second backslash (`\\1`) or use a raw string.

    """
    # first, see if the pattern is even in the file.
    with open(fname) as f:
        if not any(re.search(pat, line) for line in f):
            return  # pattern does not occur in file so we are done.

    # pattern is in the file, so write the replaced text to a temporary file.
    out_fname = fname + ".tmp"
    try:
        with open(fname) as f:
            with open(out_fname, "w") as out:
                for line in f:
                    out.write(re.sub(pat, s_after, line))
    except Exception:
        # don't leave a half-written temporary file behind
        if os.path.exists(out_fname):
            os.remove(out_fname)
        raise

    # swap the temporary file into place
    try:
        os.rename(out_fname, fname)
    except OSError:
        if os.name != 'nt':
            raise
        # Windows refuses to rename onto an existing file: remove it first
        os.remove(fname)
        os.rename(out_fname, fname)



def dump_css(dst_dir):
    """ Dumps the CSS from 4cdn into `dst_dir`.

    Each stylesheet `<name>.<version>.css` on the FOURCHAN_STATIC host is
    saved locally as `<name>.css` through `download_file`.

    (FIXME) Currently uses a static list of stylesheets, which works but is not ideal. Eventually, we need to create a JSON HTML Templater system.

    """
    # (FUTURE) Mod dump_css() to automatically scrape CSS links each time.
    css_version = "473"
    css_names = [
        "yotsubluemobile",
        "yotsubluenew",
        "yotsubanew",
        "futabanew",
        "burichannew",
        "photon",
        "tomorrow",
    ]

    for css_name in css_names:
        css_url = "%s%s/css/%s.%s.css" % (HTTP_HEADER, FOURCHAN_STATIC, css_name, css_version)
        # the local copy drops the version number from the file name
        download_file(css_name + ".css", dst_dir, css_url)



def dump_html(dst_dir, board, thread):
    """ Dumps thread in raw HTML format to `<thread-id>.html`

    dst_dir -- folder the HTML file and the `css` sub-folder are written to
    board   -- board name, used to build the thread URL
    thread  -- thread id, used for the URL and the file name

    After the download, links to images, thumbnails and stylesheets inside
    the saved page are rewritten to point at the local copies. If the page
    could not be downloaded and no earlier copy exists, nothing else is done.

    """
    # Dots in the host names are escaped so they only match literal dots
    fourchan_images_regex = re.compile(r"https?://" + re.escape(FOURCHAN_IMAGES) + r"/\w+/src/")
    # thumbnail hosts are matched with or without a numeric sub-host (`0.`)
    fourchan_thumbs_regex = re.compile(r"https?://(?:\d+\.)?" + re.escape(FOURCHAN_THUMBS) + r"/\w+/thumb/")
    fourchan_css_regex = re.compile(r"https?://" + re.escape(FOURCHAN_STATIC) + r"/css/(\w+)\.\d+\.css")

    html_filename = "%s.html" % thread
    html_url = FOURCHAN_BOARDS_URL % (board, thread)
    # NOTE(review): download_file skips files that already exist, so on later
    # dumps of the same thread the saved HTML is presumably not refreshed --
    # confirm whether that is intended.
    download_file(html_filename, dst_dir, html_url)

    html_path = os.path.join(dst_dir, html_filename)
    if not os.path.exists(html_path):
        # the download failed and there is no earlier copy to rewrite
        return

    # Convert all links in HTML dump to use locally downloaded files.
    # Protocol-relative links get a scheme first so the regexes can match them.
    file_replace(html_path, '"//', '"http://')
    file_replace(html_path, fourchan_images_regex, _IMAGE_DIR_NAME + "/")
    file_replace(html_path, fourchan_thumbs_regex, _THUMB_DIR_NAME + "/")

    # Download a local copy of all CSS files
    dst_css_dir = os.path.join(dst_dir, _CSS_DIR_NAME)
    make_sure_path_exists(dst_css_dir)
    dump_css(dst_css_dir)

    # convert HTML links to use local CSS files that we just downloaded
    # (FIXME) Might want to mod the HTML to use only ONE CSS file (perhaps by option)
    file_replace(html_path, fourchan_css_regex, _CSS_DIR_NAME + r"/\1.css")



def dump_json(dst_dir, board, thread):
    """ Grab thread JSON from 4chan API and save it, pretty printed, as `<thread-id>.json`

    dst_dir -- folder the JSON file is written to
    board   -- board name, used to build the API URL
    thread  -- thread id, used for the URL and the file name

    When the API does not answer with HTTP 200 nothing is written, so a JSON
    file saved by an earlier dump is kept as it is.

    """
    json_filename = "%s.json" % thread
    json_path = os.path.join(dst_dir, json_filename)

    json_url = FOURCHAN_API_URL % (board, thread)
    json_thread = requests.get(json_url)
    if json_thread.status_code != 200:
        print(' | Failed, try later (%s)' % json_url)
        return

    # NOTE(review): `json` is read as an attribute, as the code did before;
    # newer `requests` releases presumably expose it as a method -- confirm
    # before switching away from requests0.
    with open(json_path, 'w') as json_file:
        json.dump(json_thread.json, json_file, sort_keys=True, indent=2, separators=(',', ': '))



def list_external_links(curr_thread, dst_dir):
    """ Get all external links quoted in comments

    curr_thread -- thread object; every item of `curr_thread.replies` must
                   have a `Comment` attribute (text or None)
    dst_dir     -- existing folder in which `external_links.txt` is written

    The link list is rewritten from scratch on every call, one URL per line.

    """
    # A URL starts with `scheme:`, `www.` or `domain.tld/` and runs up to the
    # next whitespace, parenthesis or angle bracket. The earlier pattern
    # contained invisible zero-width characters and unescaped brackets; this
    # spells out what that pattern effectively matched, without the dead parts.
    linkregex = re.compile(
        r"((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])"
        r"|www\d{0,3}[.]"
        r"|[a-z0-9.\-]+[.][a-z]{2,4}/)"
        r"[^\s()<>]*)")

    # File to store list of all external links quoted in comments (overwritten upon each call)
    linklist_dst = os.path.join(dst_dir, "external_links.txt")

    with open(linklist_dst, "w") as linklist_file:
        for reply in curr_thread.replies:
            if reply.Comment is None:
                continue

            # We need to get rid of all <wbr> tags before parsing, otherwise
            # a link that is split by such a tag is cut short or missed
            cleaned_com = reply.Comment.replace('<wbr>', '')
            for url in linkregex.findall(cleaned_com):
                print("Found URL, saving in %s:\n%s\n" % (linklist_dst, url))
                linklist_file.write(url)
                linklist_file.write('\n')	# subdivide with newlines



def get_files(dst_dir, curr_thread, nothumbs, thumbsonly):
    """ Download the images and/or thumbnails of a thread

    dst_dir     -- thread folder; the image and thumbnail sub-folders are
                   created inside it
    curr_thread -- thread object providing `Files()` and `Thumbs()`, which
                   are iterated as URLs
    nothumbs    -- when true, thumbnails are skipped (unless `thumbsonly`)
    thumbsonly  -- when true, only thumbnails are downloaded, no images

    """
    # Create and set destination folders
    dst_images_dir = os.path.join(dst_dir, _IMAGE_DIR_NAME)
    make_sure_path_exists(dst_images_dir)
    dst_thumbs_dir = os.path.join(dst_dir, _THUMB_DIR_NAME)
    make_sure_path_exists(dst_thumbs_dir)

    # The local file name is the last path segment of the URL. Taking it
    # directly (instead of stripping a host-specific prefix with a regex)
    # keeps working whichever host name the URLs use.
    if not thumbsonly:
        # Dump all images within a thread from 4chan
        for image_url in curr_thread.Files():
            image_name = image_url.rsplit('/', 1)[-1]
            download_file(image_name, dst_images_dir, image_url)

    if thumbsonly or not nothumbs:
        # Dump all thumbnails within a thread from 4chan
        for thumb_url in curr_thread.Thumbs():
            thumb_name = thumb_url.rsplit('/', 1)[-1]
            download_file(thumb_name, dst_thumbs_dir, thumb_url)



def dump(dst_dir, board, thread, curr_thread, nothumbs, thumbsonly):
    """ Run one complete archive pass over the thread

    Creates `dst_dir` if needed, then saves images/thumbnails, the list of
    external links, the raw HTML and the JSON of the thread, in that order.

    """
    # The destination folder has to exist before anything is written into it
    make_sure_path_exists(dst_dir)

    # Steps that work on the thread object: images and thumbs first,
    # then all external links quoted in comments
    get_files(dst_dir, curr_thread, nothumbs, thumbsonly)
    list_external_links(curr_thread, dst_dir)

    # Steps that work on board and thread id: raw HTML to `<thread-id>.html`,
    # then pretty printed JSON to `<thread-id>.json`
    for dump_step in (dump_html, dump_json):
        dump_step(dst_dir, board, thread)



def timestamp():
    """ Return the current local time formatted as `YYYY-MM-DD HH:MM:SS` """
    # localtime() without an argument uses the current time
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())



"""=== Main Function ==="""

def main(args):
    """ Check 4chan API for new content, and repeatedly dump the thread

    args -- docopt style dictionary with the keys `<url>`, `--path`,
            `--nothumbs`, `--thumbsonly` and `--delay`

    The thread is dumped once, then checked again every `--delay` seconds and
    dumped again whenever new replies are found. The function only returns by
    raising `SystemExit`: when the thread cannot be found, when it is flagged
    as 404, when the URL cannot be parsed, or when Ctrl-C is pressed.

    """
    # Message to display upon completion
    _DUMP_COMPLETE_STRING = " :: Dump complete. To resume dumping the same thread,\nrun this script again."

    # Copy data from docopt arguments
    url = args.get('<url>')
    path = args.get('--path')
    nothumbs = args.get('--nothumbs', False)
    thumbsonly = args.get('--thumbsonly', False)
    delay = args.get('--delay')

    # Board and thread id are the 4th and 6th `/`-separated parts of the URL.
    # A trailing `#p123` anchor or query string is dropped first, so it does
    # not end up in the thread id.
    url_parts = url.split('#')[0].split('?')[0].split('/')
    try:
        board = url_parts[3]
        thread = url_parts[5]
        thread_id = int(thread)
    except (IndexError, ValueError):
        print(_TAG + "Could not find a board and thread number in URL: %s" % url)
        raise SystemExit(1)

    # Set a default path if none is given
    if path is None:
        path = os.path.join(os.getcwd(), _DEFAULT_FOLDER)

    # Set destination directory
    dst_dir = os.path.join(path, board, thread)

    # Initialize py4chan Board object
    curr_board = py4chan.Board(board)

    # Check if the thread exists, then create py4chan thread object
    if not curr_board.threadExists(thread_id):
        # Stop the script if the thread is not found
        print(_TAG + "Thread %s not found." % thread)
        print(_TAG + "Either the thread already 404'ed, your URL is incorrect, or you aren't connected to the internet")
        raise SystemExit(0)
    curr_thread = curr_board.getThread(thread_id)

    # header
    print(_TAG + 'Board : 4chan /%s/' % board)
    print(_TAG + 'Thread: %s' % thread)
    print(_TAG + 'Folder: %s' % dst_dir)

    # Using try/except around the whole run to handle Ctrl-C
    try:
        # dump thread for the first time
        dump(dst_dir, board, thread, curr_thread, nothumbs, thumbsonly)

        while True:
            # Wait to execute code again
            print("\n" + _TAG + "Waiting %s seconds before retrying (Type Ctrl-C to stop)" % delay)
            time.sleep(int(delay))

            if curr_thread.is_404:
                # Stop when thread gets 404'ed
                print(_TAG + "%s - [Thread 404'ed or Connection Lost]" % timestamp())
                print(_DUMP_COMPLETE_STRING)
                raise SystemExit(0)

            # Update thread and check if new replies have appeared
            newReplies = curr_thread.update()
            if newReplies == 0:
                print(_TAG + "%s - [No new posts.]" % timestamp())
                continue
            print(_TAG + "%s - [%s new post(s) found!]" % (timestamp(), newReplies))

            # If all tests are OK, dump thread again
            dump(dst_dir, board, thread, curr_thread, nothumbs, thumbsonly)

    except KeyboardInterrupt:
        # Stop the loop when [Ctrl-C] is pressed
        print("\n")
        print(_DUMP_COMPLETE_STRING)
        raise SystemExit(0)



""" Use docopt to get arguments, and run main function """
if __name__ == '__main__':
    # Parse the command line against the usage text, then hand the resulting
    # argument dictionary straight to main()
    main(docopt(doc, version=0.3))
