#!/usr/bin/python
# coding: utf8
import re
import urllib2
from collections import defaultdict, OrderedDict
from lxml import html
import pandas

"""
財務分析.jpから財務データを取得しpandasの表データに変換するライブラリ
"""
# URL template of a company's page; %s is replaced with the company code.
BASE_URL = "http://www.financial-analysis.jp/component/stock/?view=stock&id=%s"
# Four digits followed by the "year" character, e.g. u"2014年".
# Backslashes are doubled because a u'' literal cannot be a raw string on
# both Python 2 and Python 3.
YEAR_PAT = re.compile(u'\\d\\d\\d\\d年', re.UNICODE)
# Digits followed by the "rank" character, e.g. u"12位".
ORDER_PAT = re.compile(u'\\d+位', re.UNICODE)
# Decimal number with optional sign and thousands separators.  The decimal
# point is escaped so that arbitrary characters ("12%", "12a3") are no longer
# accepted, and a lone leading zero is allowed so "0" and "0.5" are numeric.
# Zero-padded strings such as "0123" are still rejected.
FLOAT_PAT = re.compile(r'-?(?:0|[1-9][\d,]*)(?:\.\d*)?')
# Integer with optional sign and thousands separators ("0", "-1,234").
INT_PAT = re.compile(r'-?(?:0|[1-9][\d,]*)')

def complete_match(pat, s):
    """
    Return True when the regular expression matches the whole of ``s``.

    :param pat: pattern source string or a compiled pattern object
    :param s: text to test
    :return: True if the entire string matches the pattern, otherwise False
    """
    # Duck-type instead of ``type(pat) == str`` so that unicode pattern
    # strings and str subclasses are compiled rather than crashing on
    # a missing ``.match`` attribute.
    if not hasattr(pat, 'match'):
        pat = re.compile(pat)

    # Comparing a prefix match with ``s`` wrongly rejects patterns whose first
    # successful alternative is shorter than the input (e.g. ``a|ab`` against
    # "ab"), so require a genuine full match.
    fullmatch = getattr(pat, 'fullmatch', None)
    if fullmatch is None:
        # Python 2 has no fullmatch(); anchor the same pattern at the end.
        fullmatch = re.compile('(?:%s)\\Z' % pat.pattern, pat.flags).match
    return fullmatch(s) is not None

def table_to_list(table, text_parser=None):
    """
    Convert a table element into a list of rows, each row a list of cells.

    :param table: table element accepted by table_to_2d_dict
    :param text_parser: optional callable applied to every cell text
    :return: list of rows in row order, each a list of cell values
    """
    return list(iter_2d_dict(table_to_2d_dict(table, text_parser)))

def _parse_span(value):
    """Return a colspan/rowspan attribute as a positive int, defaulting to 1."""
    try:
        span = int(value)
    except (TypeError, ValueError):
        return 1
    return span if span > 0 else 1


def table_to_2d_dict(table, text_parser=None, strip=True):
    """
    Expand an HTML table into ``result[row][column] -> cell value``.

    Cells carrying ``colspan``/``rowspan`` are copied into every grid slot
    they cover, so each row ends up with one entry per visual column.

    :param table: element exposing ``xpath``; its rows are found with
        ``.//tr`` and their cells with ``.//td|.//th``
    :param text_parser: optional callable applied once to each cell's text;
        its result is stored in every slot the cell spans
    :param strip: strip surrounding whitespace from the cell text when true
    :return: defaultdict of defaultdicts keyed by row index, then column index
    """
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3, keeping
    # the original default factory while staying importable on both.
    text_type = type(u'')
    result = defaultdict(lambda: defaultdict(text_type))
    for row_i, row in enumerate(table.xpath('.//tr')):
        for col_i, col in enumerate(row.xpath('.//td|.//th')):
            # Malformed or non-positive span attributes fall back to 1 rather
            # than raising or silently dropping the cell.
            colspan = _parse_span(col.get('colspan', 1))
            rowspan = _parse_span(col.get('rowspan', 1))
            col_data = col.text_content()
            if strip:
                col_data = col_data.strip()
            # Parse once per cell instead of once per spanned slot.
            value = col_data if text_parser is None else text_parser(col_data)
            # Skip slots already claimed by a rowspan/colspan of earlier cells.
            while row_i in result and col_i in result[row_i]:
                col_i += 1
            for i in range(row_i, row_i + rowspan):
                for j in range(col_i, col_i + colspan):
                    result[i][j] = value
    return result

def iter_2d_dict(dct):
    """
    Yield each row of a two-level dict as a list of its values.

    Rows are produced in ascending order of the outer key, and the values
    inside a row in ascending order of the inner key.
    """
    ordered_rows = sorted(dct.items())
    for _, cells in ordered_rows:
        yield [value for _, value in sorted(cells.items())]

def parser_for_ana(text):
    """
    Convert one cell of an analysis table into a Python value.

    :param text: cell text
    :return: None for the "-" placeholder, an int or float when the text
        matches INT_PAT / FLOAT_PAT (commas removed), otherwise the text
        unchanged
    """
    if text == "-":
        return None
    is_int = complete_match(INT_PAT, text)
    if not (is_int or complete_match(FLOAT_PAT, text)):
        return text
    digits = text.replace(",", "")
    try:
        return int(digits) if is_int else float(digits)
    except ValueError:
        # A pattern hit does not guarantee the text is convertible; keep the
        # raw text so one odd cell cannot abort parsing of the whole table.
        return text

def parser_for_3tbl(text):
    """
    Convert one cell of a financial-statement table into a Python value.

    :param text: cell text
    :return: an int for year cells (YEAR_PAT) and rank cells (ORDER_PAT)
        with the suffix character removed, an int or float when the text
        matches INT_PAT / FLOAT_PAT (commas removed), otherwise the text
        unchanged
    """
    if complete_match(YEAR_PAT, text):
        return int(text.replace(u"年", ""))
    if complete_match(ORDER_PAT, text):
        return int(text.replace(u"位", ""))
    is_int = complete_match(INT_PAT, text)
    if not (is_int or complete_match(FLOAT_PAT, text)):
        return text
    digits = text.replace(",", "")
    try:
        return int(digits) if is_int else float(digits)
    except ValueError:
        # A pattern hit does not guarantee the text is convertible; keep the
        # raw text so one odd cell cannot abort parsing of the whole table.
        return text

def merge_list(list0, list1):
    """
    Merge two lists element by element.

    Where the paired elements differ they are combined with ``+``; where
    they are equal the element from ``list0`` is kept.  The result is as
    long as the shorter input.
    """
    return [a + b if a != b else a for a, b in zip(list0, list1)]

def _frame_from_rows(list_table, drop_last_row=False):
    """
    Build ``(caption, DataFrame)`` from a table given as a list of rows.

    The first two rows are merged into the header, the first column of the
    remaining rows becomes the index, and the header's first cell is returned
    as the caption.
    """
    header = merge_list(list_table[0], list_table[1])
    body = list_table[2:]
    if drop_last_row:
        body = body[:-1]
    index = [row[0] for row in body]
    data = [row[1:] for row in body]
    return header[0], pandas.DataFrame(data, columns=header[1:], index=index)


def get_tables(code, proxies=None):
    """
    Fetch a company's page and return its tables as DataFrames.

    :param code: company code, substituted into BASE_URL
    :type code: str
    :param proxies: optional proxy mapping passed to urllib2.ProxyHandler
    :return: OrderedDict mapping a table title to a pandas DataFrame
    :raises IndexError: if the page has fewer than two ``fieldset`` elements
        or a table has fewer than two rows
    """
    if proxies is None:
        open_url = urllib2.urlopen
    else:
        # Use a private opener instead of install_opener(), which would leave
        # the proxy active for every later urllib2 call in the process.
        open_url = urllib2.build_opener(urllib2.ProxyHandler(proxies)).open
    response = open_url(BASE_URL % code)
    try:
        doc = html.fromstring(response.read())
    finally:
        response.close()

    fieldset = doc.xpath("//fieldset")
    tables = OrderedDict()

    # Second fieldset: tabbed tables, each paired with its tab title.
    tabbed = fieldset[1]
    titles = tabbed.xpath("./div/ul[@class='mootabs_title']/li/span")
    tab_tables = tabbed.xpath("./div/div/table/tr/td[@valign='top']/table")
    for title, table in zip(titles, tab_tables):
        _, frame = _frame_from_rows(table_to_list(table, parser_for_ana))
        # The key is the part of the tab title after its last '.'.
        tables[title.text.split('.')[-1]] = frame

    # Last fieldset: tables keyed by their own top-left header cell.
    for table in fieldset[-1].xpath(".//table"):
        # The final body row is excluded, as before.
        # NOTE(review): presumably a footer/notes row; confirm against the page.
        caption, frame = _frame_from_rows(
            table_to_list(table, parser_for_3tbl), drop_last_row=True)
        tables[caption] = frame
    return tables

if __name__ == "__main__":
    tables = get_tables("1301")
    for k, v in tables.items():
        print "****** ", k, " *************************"
        print v
