#! /bin/env python
# -*- coding=gbk -*-

"""
Copyright (c) 2014 Baidu.com, Inc. All Rights Reserved
@Author: wangyan15@baidu.com  2014-08-18
@File: disam_check_diff.py 
@Version: 1.0.0
@Dep: python2.7, mrjob
@Brief: check disam diff
"""


import os
import sys
import codecs
import math
import base64
import simplejson as json 

import sqlite3dbm 
import mrjob
from mrjob.job import MRJob

from utils.recom_mrjob.binary_pickle_protocol import BinaryPickleProtocol
from mrjob.protocol import RawProtocol,JSONProtocol,RawValueProtocol
from protocol.recommend_protocol import AutoKeySerializor, CardRankerValueProtocol
from protocol.recommend_protocol import CardRankerValueSerializor


def decode_query():
    """Decode serialized keys from stdin and print each key's query.

    Each stripped line of stdin is decoded with ``AutoKeySerializor``. The
    decoded value is handled as either a single key object (exposing ``.key``)
    or a tuple/list of key objects. The query is written to stdout encoded
    as gb18030, one per input line.

    NOTE(review): for a tuple/list key the query is taken from the first
    element's ``.key``, mirroring the single-key branch -- confirm this is
    the intended element.
    """
    key_protocol = AutoKeySerializor()
    for line in sys.stdin:
        key = key_protocol.decode(line.strip())

        # The previous implementation built a tuple of (key, level) pairs for
        # composite keys and then called .encode() on the first *pair*, which
        # always raised AttributeError. Pick the query string directly.
        if isinstance(key, (tuple, list)):
            query = key[0].key
        else:
            query = key.key
        print(query.encode("gb18030"))



class CheckDisamDiffJob(MRJob):
    """Map-only job that reports card items whose entity name is a listed term.

    The terms come from the file passed via ``--error_term_file``. For every
    input record whose cards contain at least one item with a listed entity
    name, the mapper emits the encoded key together with a JSON list of
    ``(card_name, entity_name, uri, position)`` entries.
    """

    HADOOP_INPUT_FORMAT = "org.apache.hadoop.mapred.CombineTextInputFormat"
    INPUT_PROTOCOL = CardRankerValueProtocol
    OUTPUT_PROTOCOL = RawProtocol

    def configure_options(self):
        """Register the ``--error_term_file`` file option on top of the defaults."""
        super(CheckDisamDiffJob, self).configure_options()
        self.add_file_option('--error_term_file')

    def check_disam_diff_mapper_init(self):
        """Set up serializers and load the term file into ``self.kv_dic``.

        The term file is read line by line; each stripped line is decoded
        from gb18030 to ``unicode`` and stored as a key of ``self.kv_dic``
        (the value ``1`` is only a placeholder for membership tests).
        """
        # Python 2 idiom: reload(sys) re-exposes setdefaultencoding so that
        # implicit str <-> unicode conversions use gb18030.
        reload(sys)
        sys.setdefaultencoding("gb18030")
        self.key_serializor = AutoKeySerializor
        self.value_serializor = CardRankerValueSerializor

        # Build the ambiguity term dict; the file is closed deterministically
        # instead of being left to the garbage collector.
        file_name = self.options.error_term_file
        self.kv_dic = {}
        with open(file_name, "r") as term_file:
            for line in term_file:
                uline = unicode(line.strip(), "gb18030")
                self.kv_dic[uline] = 1

    def check_disam_diff_mapper(self, key, value):
        """Yield the record's key and matching items, if there are any.

        Args:
            key: decoded input key; passed unchanged to the key serializer.
            value: decoded input value exposing ``card_list``; each card has
                ``card_info.card_name`` and an ``item_list`` whose items
                expose ``entity_name`` and ``uri``.

        Yields:
            At most one ``(encoded_key, json_text)`` pair, where ``json_text``
            is a compact JSON array of
            ``[card_name, entity_name, uri, position]`` entries. ``position``
            is the 1-based index of the item within its card. Nothing is
            yielded when no item matches.
        """
        ret_list = []
        for card in value.card_list:
            card_name = card.card_info.card_name
            for pos, item in enumerate(card.item_list, 1):
                term_name = item.entity_name
                term_uri = item.uri
                if term_name in self.kv_dic:
                    ret_list.append((card_name, term_name, term_uri, pos))

        if ret_list:
            encoded_key = self.key_serializor.encode(key)
            payload = json.dumps(ret_list, encoding='gb18030',
                                 ensure_ascii=False, separators=(',', ':'))
            yield encoded_key, payload

    def steps(self):
        """Return the single mapper-only step (mapper_init + mapper)."""
        return [
            self.mr(
                mapper_init=self.check_disam_diff_mapper_init,
                mapper=self.check_disam_diff_mapper,
            ),
        ]



if __name__ == "__main__":
    # The MapReduce job entry point below is currently disabled; when run as
    # a script this module only decodes serialized keys read from stdin.
    #CheckDisamDiffJob.run()
    decode_query()


