最近写了一个提取文档关键词的程序,使用 TF-IDF 方法进行提取,其中需要用 Python 读取 MS Word 文档和 MS PowerPoint 文档中的内容。现将部分讲解和代码贴出来,请指正。

  首先,介绍一下 win32com。这是一个与 Windows 交互的模块,说实话,功能很强大:在网上可以看到很多用法,例如打开 Word、PPT、Excel、Access,模拟浏览器行为等。下载地址:http://starship.python.net/~skippy/win32/Downloads.html。下面介绍它的部分功能,这些内容是从网上其他资料整理、粘贴过来的。

1.word功能:http://my.oschina.net/duxuefeng/blog/64137(这个写的比较清楚) 

  # Tour of common Word automation tasks through win32com: open a document,
  # insert and style text, find/replace in the body and header, edit a table,
  # export to HTML, print, and shut Word down.
  #
  # filenamein, filenameout, OldStr and NewStr are placeholders that must be
  # defined before this snippet is run.
  #
  # NOTE(review): the constants.* names presumably resolve only when makepy
  # support for the Word type library has been generated -- confirm.
  import win32com
  from win32com.client import Dispatch, constants

  w = win32com.client.Dispatch('Word.Application')
  # Alternatively, start Word in a separate process:
  # w = win32com.client.DispatchEx('Word.Application')

  # Run in the background: no window, no alert dialogs.
  w.Visible = 0
  w.DisplayAlerts = 0

  try:
      # Open an existing file.
      doc = w.Documents.Open(FileName=filenamein)
      # doc = w.Documents.Add()  # or create a new document instead

      # Insert text at the very start of the document.
      myRange = doc.Range(0, 0)
      myRange.InsertBefore('Hello from Python!')

      # Apply a style. Range.Select() has no return value, so the style is
      # set on the application's Selection object after selecting the range.
      myRange.Select()
      w.Selection.Style = constants.wdStyleHeading1

      # Replace text in the document body.
      # Positional arguments: FindText, MatchCase, MatchWholeWord,
      # MatchWildcards, MatchSoundsLike, MatchAllWordForms, Forward,
      # Wrap (1 = wdFindContinue), Format, ReplaceWith, Replace (2 = wdReplaceAll).
      w.Selection.Find.ClearFormatting()
      w.Selection.Find.Replacement.ClearFormatting()
      w.Selection.Find.Execute(OldStr, False, False, False, False, False,
                               True, 1, True, NewStr, 2)

      # Replace text in the page header.
      headerFind = w.ActiveDocument.Sections[0].Headers[0].Range.Find
      headerFind.ClearFormatting()
      headerFind.Replacement.ClearFormatting()
      headerFind.Execute(OldStr, False, False, False, False, False,
                         True, 1, False, NewStr, 2)

      # Table operations (on the opened document; the original referred to
      # an undefined name here).
      doc.Tables[0].Rows[0].Cells[0].Range.Text = '123123'
      doc.Tables[0].Rows.Add()  # append a row

      # Export as HTML.
      webOptions = w.ActiveDocument.WebOptions
      webOptions.RelyOnCSS = 1
      webOptions.OptimizeForBrowser = 1
      webOptions.BrowserLevel = 0  # constants.wdBrowserLevelV4
      webOptions.OrganizeInFolder = 0
      webOptions.UseLongFileNames = 1
      webOptions.RelyOnVML = 0
      webOptions.AllowPNG = 1
      w.ActiveDocument.SaveAs(FileName=filenameout,
                              FileFormat=constants.wdFormatHTML)

      # Print.
      doc.PrintOut()
  finally:
      # Always shut Word down: the instance is invisible, so a failure above
      # would otherwise leave a hidden WINWORD process running.
      try:
          # doc.Close()
          w.Documents.Close(constants.wdDoNotSaveChanges)
      finally:
          w.Quit()

View Code

2. Excel功能:同上的网址

3. Access功能:

  # Create an Access database file through COM automation.
  import win32com.client

  oAccess = win32com.client.Dispatch('Access.Application')
  try:
      DbFile = r'C:\Users\hans\Documents\NewDb.mdb'
      dbLangGeneral = ';LANGID=0x0409;CP=1252;COUNTRY=0'
      # dbVersion40 64
      dbVersion = 64
      oAccess.DBEngine.CreateDatabase(DbFile, dbLangGeneral, dbVersion)
  finally:
      # Quit even when CreateDatabase fails so no Access process is left behind.
      oAccess.Quit()
      del oAccess

View Code

4. 模拟浏览器行为:http://www.cnblogs.com/chenzehe/archive/2010/09/01/1814397.html

  #!/usr/bin/env python
  # -*- coding:UTF-8 -*-
  '''
  Log in to cnblogs through an automated Internet Explorer instance, print the
  current user's nickname, then log out.

  Created on 2010-9-1
  @author: chenzehe
  '''
  import win32com.client
  from time import sleep

  loginurl = 'http://passport.cnblogs.com/login.aspx'
  loginouturl = 'http://passport.cnblogs.com/logout.aspx'
  username = 'XXX'
  password = 'XXX'

  ie = win32com.client.Dispatch("InternetExplorer.Application")
  ie.Visible = 0

  try:
      ie.Navigate(loginurl)
      print("打开登陆页面")
      # Poll once per second until the page reports ReadyState 4.
      while ie.ReadyState != 4:
          sleep(1)
      print("页面载入完毕,输入用户名密码")

      ie.Document.getElementById("tbUserName").value = username
      ie.Document.getElementById("tbPassword").value = password
      ie.Document.getElementById("btnLogin").click()

      # Wait until the browser has landed on the home page after the login.
      while 1:
          state = ie.ReadyState
          print(state)
          if state == 4 and str(ie.LocationURL) == "http://home.cnblogs.com/":
              break
          sleep(1)
      print("登陆成功")
      print('你的昵称是:')
      print(ie.Document.getElementById('lnk_current_user').title)

      # cnblogs allows only one active login, so log out again.
      print('注销!')
      ie.Navigate(loginouturl)
      # Let the logout request finish before the browser is closed.
      while ie.Busy or ie.ReadyState != 4:
          sleep(1)
  finally:
      # The window is invisible; quit explicitly so no hidden IE process lingers.
      ie.Quit()

View Code

5.播放mp3文件:http://www.sharejs.com/codes/python/5733

  # Play an audio file through the Windows Media Player COM control and stop
  # when the user presses Enter.
  from win32com.client import Dispatch

  mp = Dispatch("WMPlayer.OCX")
  # use an mp3 file you have ...
  #tune = mp.newMedia("C:/Program Files/Common Files/HP/Memories Disc/2.0/audio/Swing.mp3")
  # or copy one to the working folder ...
  #tune = mp.newMedia("Bier1.mp3")
  # you can also play wma files, this cool sound came with XP ...
  tune = mp.newMedia("C:/WINDOWS/system32/oobe/images/title.wma")
  mp.currentPlaylist.appendItem(tune)
  mp.controls.play()
  # Block until the user presses Enter, then stop playback.
  raw_input("Press Enter to stop playing")
  mp.controls.stop()

View Code

  真心感觉这个东西很强大呀!言归正传,该上将word和ppt转化为txt的代码了!如下:

  1. 1 #coding:utf-8
  2. 2 import win32com
  3. 3 import win32con
  4. 4 import win32gui
  5. 5 import codecs
  6. 6 from win32com.client import Dispatch
  7. 7 import pythoncom
  8. 8
  class MSOffice2txt(object):
      """Convert Word and PowerPoint files to plain text through COM automation.

      The Word / PowerPoint application objects are created in the constructor
      for the requested file types, or on first use by translate(). Call
      close() when finished so the applications are quit.
      """

      def __init__(self, fileType=('doc', 'ppt')):
          """Start the Office applications named in fileType.

          fileType -- list or tuple of 'doc' and/or 'ppt'; other entries are
                      ignored.

          Raises TypeError when fileType is not a list or tuple. The original
          tried to return an error string from __init__, which Python itself
          turns into a less helpful TypeError.
          """
          self.docCom = None
          self.pptCom = None
          # Validate before touching COM so a bad argument fails cleanly.
          if not isinstance(fileType, (list, tuple)):
              raise TypeError('Error, please check the fileType, it must be list[]')
          pythoncom.CoInitialize()
          for ft in fileType:
              if ft == 'doc':
                  self.docCom = self.docApplicationOpen()
              elif ft == 'ppt':
                  self.pptCom = self.pptApplicationOpen()

      def close(self):
          """Quit whichever applications are open and forget their handles."""
          # Nested try/finally: a failure while quitting Word must not prevent
          # PowerPoint from being quit, and stale handles are never kept.
          try:
              self.docApplicationClose(self.docCom)
          finally:
              self.docCom = None
              try:
                  self.pptApplicationClose(self.pptCom)
              finally:
                  self.pptCom = None

      def _hideWindow(self, title):
          """Hide the top-level window with the given title, if one is found."""
          hwnd = win32gui.FindWindow(None, title)
          # NOTE(review): the window title presumably varies with Office
          # version and UI language, so the lookup may find nothing -- confirm.
          if hwnd:
              win32gui.ShowWindow(hwnd, win32con.SW_HIDE)

      def docApplicationOpen(self):
          """Create a Word application object, hide its window and return it."""
          docCom = win32com.client.Dispatch('Word.Application')
          docCom.Visible = 1
          docCom.DisplayAlerts = 0
          self._hideWindow('Microsoft Word')
          return docCom

      def docApplicationClose(self, docCom):
          """Quit the given Word application object; None is ignored."""
          if docCom is not None:
              docCom.Quit()

      def doc2Txt(self, docCom, docFile, txtFile):
          """Open docFile read-only in Word and save it as text to txtFile."""
          doc = docCom.Documents.Open(FileName=docFile, ReadOnly=1)
          try:
              doc.SaveAs(txtFile, 2)  # 2 = wdFormatText
          finally:
              # Close the document even when saving fails.
              doc.Close()

      def pptApplicationOpen(self):
          """Create a PowerPoint application object, hide its window and return it."""
          pptCom = win32com.client.Dispatch('PowerPoint.Application')
          pptCom.Visible = 1
          pptCom.DisplayAlerts = 0
          self._hideWindow('Microsoft PowerPoint')
          return pptCom

      def pptApplicationClose(self, pptCom):
          """Quit the given PowerPoint application object; None is ignored."""
          if pptCom is not None:
              pptCom.Quit()

      def ppt2txt(self, pptCom, pptFile, txtFile):
          """Write the text of every text-bearing shape in pptFile to txtFile.

          Output is encoded as gb18030; shape texts are written back to back
          in slide order, without separators.
          """
          ppt = pptCom.Presentations.Open(pptFile, ReadOnly=1, Untitled=0, WithWindow=0)
          try:
              with codecs.open(txtFile, "w", 'gb18030') as f:
                  slides = ppt.Slides
                  # COM collections are 1-based.
                  for i in range(1, slides.Count + 1):
                      shapes = slides(i).Shapes
                      for j in range(1, shapes.Count + 1):
                          shape = shapes(j)
                          if shape.HasTextFrame:
                              f.write(shape.TextFrame.TextRange.Text)
          finally:
              # Close the presentation even when reading or writing fails.
              ppt.Close()

      def translate(self, filename, txtFilename):
          """Convert filename to text in txtFilename, choosing by extension.

          Returns True after converting a .doc/.docx/.ppt/.pptx file (matched
          case-insensitively) and False for any other file name.
          """
          lowered = filename.lower()
          # Match the real extension, dot included, so a name that merely ends
          # in "doc" or "ppt" is not handed to Office.
          if lowered.endswith(('.doc', '.docx')):
              if self.docCom is None:
                  self.docCom = self.docApplicationOpen()
              self.doc2Txt(self.docCom, filename, txtFilename)
              return True
          if lowered.endswith(('.ppt', '.pptx')):
              if self.pptCom is None:
                  self.pptCom = self.pptApplicationOpen()
              self.ppt2txt(self.pptCom, filename, txtFilename)
              return True
          return False
  83. 83
  if __name__ == '__main__':
      # Demo: convert one Word document to temp.txt and report the outcome.
      msoffice = MSOffice2txt()
      filename = u'F:\\study.docx'
      try:
          if msoffice.translate(filename, 'temp.txt'):
              print('Successed!')
          else:
              print('Failed!')
      finally:
          # Always quit the Office applications, even when the conversion
          # raises; their windows are hidden and would otherwise linger.
          msoffice.close()

 

   

版权声明:本文为AlgorithmDot原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://www.cnblogs.com/algorithmdot/p/3386918.html