`
yiminghe
  • 浏览: 1432481 次
  • 性别: Icon_minigender_1
  • 来自: 上海
社区版块
存档分类
最新评论

python转换文件编码应用

阅读更多

初次写python应用,比较简单的转换文件编码,用来将文件转换到指定编码,主要利用了 open 文件操作,os 目录遍历,chardet 编码探测,解决 movist(多字幕播放器) 只能正确读取utf-8字幕文件问题,一次将目录下所有字幕都转换成utf-8编码.

 

ps:发现使用多线程后,时间反而会上升一倍,看来对于小任务线程还是开销比较大的

 

/Users/yiminghe/code/python/tools/encode.py :

 

# -*- coding: utf-8 -*-
import sys,os,shutil,traceback,time
from chardet.universaldetector import UniversalDetector
# Chinese encodings: lower-cased encoding names mapped to the codec name that
# gb() substitutes for them. Both legacy GB names resolve to gb18030.
# NOTE(review): presumably chosen because gb18030 can decode everything that
# gbk/gb2312 can -- confirm against the codec documentation.
encodes={
    "gb2312":"gb18030",
    "gbk":"gb18030"
}
class HeEncodingEx(Exception):
    '''
    Error raised when a usable text encoding cannot be determined.
    @param msg{string}: human-readable description of the problem
    '''

    def __init__(self, msg):
        # Hand the message to Exception so str(e) and e.args carry it.
        super(HeEncodingEx, self).__init__(msg)
def gb(encoding):
    '''
    Normalise an encoding name and apply the substitutions listed in encodes.
    @param encoding{string}: encoding name, or None when detection failed
    @return{string}: the stripped, lower-cased name, replaced by its entry in
                     encodes when one exists
    @raise HeEncodingEx: when encoding is None
    '''
    if encoding is None:
        # Call form of raise: valid on Python 2 and 3 alike, unlike the
        # deprecated "raise Cls, msg" statement.
        raise HeEncodingEx("unknown encoding")
    encoding=encoding.strip().lower()
    # Names without an entry pass through unchanged.
    return encodes.get(encoding, encoding)
   
def transferToEncoding(filename,toCode):
    '''
    Re-encode the text file filename in place so that it is stored as toCode.

    The source encoding is guessed with chardet and passed through gb(). The
    whole file is converted in memory before anything is written, so a failed
    detection or conversion leaves the file untouched. Before the file is
    rewritten it is copied to filename+".bak" unless that backup already
    exists (an existing backup is never overwritten).

    @param filename{string}: text file
    @param toCode{string}: text encoding code ,gbk,utf-8...etc
    @return{boolean}: True when the file already was, or now is, stored as
                      toCode; False when filename is a directory or any step
                      failed (the traceback is printed, nothing is raised).
                      KeyboardInterrupt/SystemExit are not swallowed.
    '''
    import codecs  # local import: only needed here, for codec-name normalisation

    if os.path.isdir(filename):
        print("error:not file")
        return False

    original=None
    rewriting=False
    try:
        # Binary mode: chardet and the codecs must see the bytes exactly as
        # stored, without any newline translation.
        with open(filename,"rb") as f:
            lines=f.readlines()
        original=b"".join(lines)

        #detect encoding
        detector=UniversalDetector()
        for line in lines:
            detector.feed(line)
            if detector.done: break
        detector.close()
        encode=gb(detector.result['encoding'])

        # Compare canonical codec names so that aliases ("UTF8", "utf-8")
        # count as equal. An unknown name raises LookupError here, before the
        # file has been touched.
        if codecs.lookup(encode).name != codecs.lookup(toCode).name:
            # Convert the content as a whole: lines split on "\n" bytes can
            # cut a multi-byte character (e.g. UTF-16) in half, and encoding
            # line by line would repeat the BOM for BOM-emitting targets.
            converted=original.decode(encode).encode(toCode)
            # Identical bytes (e.g. ascii -> utf-8) need neither backup nor rewrite.
            if converted != original:
                #backup orginal file
                if not os.path.exists(filename+".bak"):
                    shutil.copy(filename, filename+".bak")
                rewriting=True
                with open(filename,"wb") as f:
                    f.write(converted)
    except Exception:
        traceback.print_exc()
        # Roll back only when the rewrite had actually started, and restore
        # the bytes read in this run rather than a possibly stale .bak file.
        if rewriting:
            with open(filename,"wb") as f:
                f.write(original)
        return False
    finally:
        # Two blank lines after every processed file, as the script has
        # always printed.
        sys.stdout.write("\n\n")
    return True
    
#main
# Usage: encode.py <file-or-directory> [target-encoding]
# A single file is converted directly; for anything else the path is walked
# and every *.srt file found is converted by a small pool of worker threads.
if __name__=="__main__":

    start=time.time()

    if len(sys.argv)<2:
        print("erro argv! filename toCoding")
        sys.exit(1)
    #default transfer to utf-8
    toCode=sys.argv[2] if len(sys.argv) > 2 else "utf-8"
    filename=sys.argv[1]
    if os.path.isfile(filename):
        transferToEncoding(filename,toCode)
    else:
        import threading
        # Number of worker threads processing files at the same time.
        THREAD_NUM=10
        lock=threading.Lock()


        def fetchAndProcess(files,func):
            '''
            Worker loop: take one path at a time off files and process it.
            @param files{Array}: shared list of pending file paths
            @param func{Function}: called as func(path, toCode) for each path
            '''
            while True:
                # "with" releases the lock on every exit path. Leaving the
                # loop while still holding it would block the other workers
                # forever and hang join().
                with lock:
                    if not files:
                        break
                    file_=files.pop()
                    print("%s  got :  %s" % (threading.current_thread().ident,file_))
                # Convert outside the lock so the workers can overlap.
                func(file_,toCode)

        #folder? then walk
        all_files=[]
        for base,folders,files in os.walk(filename):
            for file_ in files:
                # Match the ".srt" extension itself, not any name that merely
                # ends in the letters "srt".
                if file_.lower().endswith(".srt"):
                    all_files.append(os.path.join(base,file_))

        threads=[threading.Thread(target=fetchAndProcess,args=(all_files,transferToEncoding))
                 for _ in range(THREAD_NUM)]

        for thread_ in threads:
            thread_.start()

        for thread_ in threads:
            thread_.join()

        # Timings noted by the author: 10, 20 or 30 threads took 40.6 s,
        # a single thread took 28.1 s.
        print("consume time : %s" % (time.time()-start))
     
 

 

/Users/yiminghe/code/python/tools/he_encode.sh:

#!/bin/bash -
# Wrapper that forwards every command-line argument to encode.py.
# Use "$@" (quoted), not $*, $@ or "$*": only the quoted "$@" passes each
# argument on as its own word, so arguments containing spaces stay intact.
python /Users/yiminghe/code/python/tools/encode.py "$@"
 

创建软链接

# rwxr-xr-x is enough to execute the wrapper; 777 would let any user rewrite
# a script that is reachable from /usr/bin.
chmod 755 /Users/yiminghe/code/python/tools/he_encode.sh
# Name the link he_encode so the command can be run as "he_encode"; linking
# into the directory alone would create /usr/bin/he_encode.sh instead.
ln -s /Users/yiminghe/code/python/tools/he_encode.sh /usr/bin/he_encode

 

运行:

 

定位在某个目录下运行即可:转换目录下的所有字幕文件为utf-8格式

 

he_encode .
 

 

 

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics