Python:如何将 PPT 和 Word 文档转换为 txt 文档
最近写了一个提取文档关键词的程序,使用 TF-IDF 方法进行提取,其中需要用 Python 读取 MS Word 文档和 MS PowerPoint 演示文稿中的内容。现将部分讲解和代码贴出来,请大家指正。
首先,介绍一下 win32com。这是一个与 Windows 交互的模块,说实话,功能很强大。在网上可以看到它的很多用途:打开 Word、PPT、Excel、Access,模拟浏览器行为等。下载地址:http://starship.python.net/~skippy/win32/Downloads.html 。下面介绍它的部分功能,示例代码都是从网上的其他资料粘贴过来的。
1.word功能:http://my.oschina.net/duxuefeng/blog/64137(这个写的比较清楚)
# Tour of Word automation through win32com: open a document, insert and
# restyle text, search/replace in the body and the header, edit a table,
# export to HTML, print, and shut Word down.
#
# The names filenamein, filenameout, OldStr and NewStr are placeholders that
# this snippet does not define; set them before running it.
#
# NOTE(review): win32com.client.constants is presumably only populated once
# makepy support for the Word type library has been generated -- confirm on
# the target machine before relying on constants.wd* names.
import win32com
from win32com.client import Dispatch, constants

w = win32com.client.Dispatch('Word.Application')
# Alternatively, start Word in a separate, dedicated process:
# w = win32com.client.DispatchEx('Word.Application')

# Run in the background: no window, no alert dialogs
w.Visible = 0
w.DisplayAlerts = 0

# Open the input file
doc = w.Documents.Open(FileName=filenamein)
# doc = w.Documents.Add()  # or create a new, empty document instead

# Insert text at the very beginning of the document
myRange = doc.Range(0, 0)
myRange.InsertBefore('Hello from Python!')

# Apply a style. Range.Select() returns nothing, so the style is set on the
# application's Selection object after the range has been selected.
myRange.Select()
w.Selection.Style = constants.wdStyleHeading1

# Replace text in the document body.
# Positional arguments of Find.Execute: FindText, MatchCase, MatchWholeWord,
# MatchWildcards, MatchSoundsLike, MatchAllWordForms, Forward, Wrap (1),
# Format, ReplaceWith, Replace (2).
w.Selection.Find.ClearFormatting()
w.Selection.Find.Replacement.ClearFormatting()
w.Selection.Find.Execute(OldStr, False, False, False, False, False, True, 1, True, NewStr, 2)

# Replace text in the page header
w.ActiveDocument.Sections[0].Headers[0].Range.Find.ClearFormatting()
w.ActiveDocument.Sections[0].Headers[0].Range.Find.Replacement.ClearFormatting()
w.ActiveDocument.Sections[0].Headers[0].Range.Find.Execute(OldStr, False, False, False, False, False, True, 1, False, NewStr, 2)

# Table operations
doc.Tables[0].Rows[0].Cells[0].Range.Text = '123123'
doc.Tables[0].Rows.Add()  # append a row (the original used the undefined name `worddoc`)

# Convert to HTML
wc = win32com.client.constants
w.ActiveDocument.WebOptions.RelyOnCSS = 1
w.ActiveDocument.WebOptions.OptimizeForBrowser = 1
w.ActiveDocument.WebOptions.BrowserLevel = 0  # constants.wdBrowserLevelV4
w.ActiveDocument.WebOptions.OrganizeInFolder = 0
w.ActiveDocument.WebOptions.UseLongFileNames = 1
w.ActiveDocument.WebOptions.RelyOnVML = 0
w.ActiveDocument.WebOptions.AllowPNG = 1
w.ActiveDocument.SaveAs(FileName=filenameout, FileFormat=wc.wdFormatHTML)

# Print
doc.PrintOut()

# Close every open document without saving, then quit Word
# doc.Close()
w.Documents.Close(wc.wdDoNotSaveChanges)
w.Quit()
View Code
2. Excel功能:同上的网址
3. Access功能:
# Create an empty Access database file through COM automation.
import win32com.client

# Path of the database to create, and the locale string passed to
# CreateDatabase alongside it.
DbFile = r'C:\Users\hans\Documents\NewDb.mdb'
dbLangGeneral = ';LANGID=0x0409;CP=1252;COUNTRY=0'
# dbVersion40 64  (original note: 64 is the value of the dbVersion40 constant)
dbVersion = 64

oAccess = win32com.client.Dispatch('Access.Application')
try:
    oAccess.DBEngine.CreateDatabase(DbFile, dbLangGeneral, dbVersion)
finally:
    # Quit even when CreateDatabase fails, so no Access process is left behind.
    oAccess.Quit()
    del oAccess
View Code
4. 模拟浏览器行为:http://www.cnblogs.com/chenzehe/archive/2010/09/01/1814397.html
#!/usr/bin/env python
# -*- coding:UTF-8 -*-
'''
Created on 2010-9-1
@author: chenzehe

Drive a hidden Internet Explorer instance through COM: open the login page,
fill in the credentials, wait for the home page, print the nickname and
log out again.
'''
import win32com.client
from time import sleep

loginurl = 'http://passport.cnblogs.com/login.aspx'
loginouturl = 'http://passport.cnblogs.com/logout.aspx'
username = 'XXX'
password = 'XXX'

ie = win32com.client.Dispatch("InternetExplorer.Application")
ie.Visible = 0

ie.Navigate(loginurl)
print("打开登陆页面")
# Poll once per second until the browser reports ReadyState 4
# (the value the original script treats as "page fully loaded").
while ie.ReadyState != 4:
    sleep(1)
print("页面载入完毕,输入用户名密码")

# Fill in the login form and submit it
ie.Document.getElementById("tbUserName").value = username
ie.Document.getElementById("tbPassword").value = password
ie.Document.getElementById("btnLogin").click()

# Wait until the browser has finished loading AND has landed on the home
# page; the state is echoed on every poll, as in the original script.
while True:
    state = ie.ReadyState
    print(state)
    if state == 4 and str(ie.LocationURL) == "http://home.cnblogs.com/":
        break
    sleep(1)
print("登陆成功")
print('你的昵称是:')
print(ie.Document.getElementById('lnk_current_user').title)

# The site only allows one login at a time, so log out again
print('注销!')
ie.Navigate(loginouturl)
View Code
5.播放mp3文件:http://www.sharejs.com/codes/python/5733
# Play a media file through the Windows Media Player COM control and stop
# when the user presses Enter.
from win32com.client import Dispatch

mp = Dispatch("WMPlayer.OCX")

# Pick the media to play. Use an mp3 file you have ...
#tune = mp.newMedia("C:/Program Files/Common Files/HP/Memories Disc/2.0/audio/Swing.mp3")
# ... or copy one to the working folder ...
#tune = mp.newMedia("Bier1.mp3")
# ... wma files work as well; this sound shipped with Windows XP.
tune = mp.newMedia("C:/WINDOWS/system32/oobe/images/title.wma")

# Queue the item on the current playlist and start playback
playlist = mp.currentPlaylist
playlist.appendItem(tune)
controls = mp.controls
controls.play()

# Block until the user presses Enter, then stop playback
raw_input("Press Enter to stop playing")
mp.controls.stop()
View Code
真心觉得这个模块很强大!言归正传,下面给出将 Word 和 PPT 转换为 txt 的代码:
- 1 #coding:utf-8
- 2 import win32com
- 3 import win32con
- 4 import win32gui
- 5 import codecs
- 6 from win32com.client import Dispatch
- 7 import pythoncom
- 8
class MSOffice2txt(object):
    """Convert MS Word and PowerPoint files to plain text via COM automation.

    Word documents are exported with Word's own "save as text" format.
    PowerPoint text is collected shape by shape and written to a
    gb18030-encoded file. The Office applications stay open between
    conversions; call :meth:`close` when finished to quit them.
    """

    # FileFormat value passed to Document.SaveAs (plain text).
    _WD_FORMAT_TEXT = 2
    # SaveChanges value passed to Document.Close (discard changes).
    _WD_DO_NOT_SAVE = 0
    _DOC_EXTENSIONS = ('.doc', '.docx')
    _PPT_EXTENSIONS = ('.ppt', '.pptx')

    def __init__(self, fileType=('doc', 'ppt')):
        """Initialise COM and start the requested Office applications.

        fileType -- list (or tuple) of 'doc' and/or 'ppt'; the matching
                    application is launched right away. Other entries are
                    ignored. Applications not started here are started
                    on demand by translate().

        Raises TypeError when fileType is not a list or tuple.
        """
        self.docCom = None
        self.pptCom = None
        # Validate before touching COM. The original returned a string from
        # __init__, which Python itself turns into an opaque TypeError.
        if not isinstance(fileType, (list, tuple)):
            raise TypeError('Error, please check the fileType, it must be list[]')
        pythoncom.CoInitialize()
        for ft in fileType:
            if ft == 'doc':
                self.docCom = self.docApplicationOpen()
            elif ft == 'ppt':
                self.pptCom = self.pptApplicationOpen()

    def close(self):
        """Quit whichever Office applications this instance started."""
        # Drop the references first so a second close() is a harmless no-op
        # instead of calling Quit on an application that is already gone.
        docCom, self.docCom = self.docCom, None
        pptCom, self.pptCom = self.pptCom, None
        try:
            self.docApplicationClose(docCom)
        finally:
            # PowerPoint must still be shut down if quitting Word failed.
            self.pptApplicationClose(pptCom)

    @staticmethod
    def _hideWindow(title):
        """Hide the top-level window whose caption is exactly `title`, if any."""
        hwnd = win32gui.FindWindow(None, title)
        # NOTE(review): FindWindow presumably yields a null handle when no
        # window carries this caption -- skip the call in that case.
        if hwnd:
            win32gui.ShowWindow(hwnd, win32con.SW_HIDE)

    @staticmethod
    def _absPath(path):
        """Return `path` made absolute against this process's working directory."""
        import os  # local import keeps the block self-contained
        return os.path.abspath(path)

    def docApplicationOpen(self):
        """Start Word with alerts disabled, hide its window, return the COM object."""
        docCom = win32com.client.Dispatch('Word.Application')
        docCom.Visible = 1
        docCom.DisplayAlerts = 0
        self._hideWindow('Microsoft Word')
        return docCom

    def docApplicationClose(self, docCom):
        """Quit the given Word application; None is accepted and ignored."""
        if docCom is not None:
            docCom.Quit()

    def doc2Txt(self, docCom, docFile, txtFile):
        """Open docFile read-only in Word and save it as plain text to txtFile."""
        # Office resolves relative paths against its own working directory,
        # not the caller's, so both paths are made absolute first.
        doc = docCom.Documents.Open(FileName=self._absPath(docFile), ReadOnly=1)
        try:
            doc.SaveAs(self._absPath(txtFile), self._WD_FORMAT_TEXT)
        finally:
            # Always release the document, even when the export failed.
            doc.Close(self._WD_DO_NOT_SAVE)

    def pptApplicationOpen(self):
        """Start PowerPoint with alerts disabled, hide its window, return the COM object."""
        pptCom = win32com.client.Dispatch('PowerPoint.Application')
        pptCom.Visible = 1
        pptCom.DisplayAlerts = 0
        self._hideWindow('Microsoft PowerPoint')
        return pptCom

    def pptApplicationClose(self, pptCom):
        """Quit the given PowerPoint application; None is accepted and ignored."""
        if pptCom is not None:
            pptCom.Quit()

    def ppt2txt(self, pptCom, pptFile, txtFile):
        """Write the text of every text-bearing shape in pptFile to txtFile (gb18030)."""
        ppt = pptCom.Presentations.Open(self._absPath(pptFile), ReadOnly=1, Untitled=0, WithWindow=0)
        try:
            with codecs.open(txtFile, "w", 'gb18030') as f:
                # Slides and Shapes are addressed from 1 up to Count.
                for i in range(1, ppt.Slides.Count + 1):
                    shapes = ppt.Slides(i).Shapes
                    for j in range(1, shapes.Count + 1):
                        shape = shapes(j)
                        if shape.HasTextFrame:
                            f.write(shape.TextFrame.TextRange.Text)
                            # Separate shapes so the last word of one does not
                            # fuse with the first word of the next.
                            f.write('\n')
        finally:
            # Always release the presentation, even when extraction failed.
            ppt.Close()

    def translate(self, filename, txtFilename):
        """Convert filename to text in txtFilename, choosing the tool by extension.

        Returns True after converting a .doc/.docx/.ppt/.pptx file (the
        extension is matched case-insensitively) and False for any other
        extension. The required Office application is started on first use.
        """
        import os  # local import keeps the block self-contained
        extension = os.path.splitext(filename)[1].lower()
        if extension in self._DOC_EXTENSIONS:
            if self.docCom is None:
                self.docCom = self.docApplicationOpen()
            self.doc2Txt(self.docCom, filename, txtFilename)
            return True
        if extension in self._PPT_EXTENSIONS:
            if self.pptCom is None:
                self.pptCom = self.pptApplicationOpen()
            self.ppt2txt(self.pptCom, filename, txtFilename)
            return True
        return False
- 83
if __name__ == '__main__':
    # Demo: convert one Word document to temp.txt and report the outcome.
    msoffice = MSOffice2txt()
    try:
        filename = u'F:\\study.docx'
        if msoffice.translate(filename, 'temp.txt'):
            print('Successed!')
        else:
            print('Failed!')
    finally:
        # Quit the Office applications even when the conversion raised,
        # otherwise their processes are left running.
        msoffice.close()