Python 读取 Word 中的段落、表格、图片，以及 Doc 转换为 Docx

读取文本、图片、表格，以及解压后的文件信息


import docx
import zipfile
import os
import shutil

def gettxt(path="gao.docx"):
    '''Print the paragraph count and each non-trivial paragraph of a .docx file.

    According to the original author's note, every hard return in the
    document starts a new paragraph, so the reported count includes short
    and empty ones; only the per-paragraph listing filters them out.

    :param path: document to read. Defaults to "gao.docx", the file name the
        original example hard-coded, so existing no-argument calls still work.
    :return: None; everything is written to stdout.
    '''
    # Fetch the paragraph list once instead of re-reading it on every access.
    paragraphs = docx.Document(path).paragraphs
    print("段落数:" + str(len(paragraphs)))

    # Print the paragraph index together with its content.
    for index, paragraph in enumerate(paragraphs):
        text = paragraph.text
        # Skip paragraphs that hold 4 or fewer characters once spaces are
        # stripped; the index printed is still the position in the document.
        if len(text.replace(' ', '')) > 4:
            print("第" + str(index) + "段的内容是：" + text)


            
def gettable(path='word.docx'):
    '''Print the text of every cell of every table in a .docx file.

    A separator line is printed before each table, then one line per cell,
    walking the rows top to bottom and the cells of each row in order.

    :param path: document to read. Defaults to 'word.docx', the file name the
        original example hard-coded, so existing no-argument calls still work.
    :return: None; everything is written to stdout.
    '''
    document = docx.Document(path)
    for table in document.tables:  # every table in the document
        print('----table------')
        for row in table.rows:  # every row of the current table
            for cell in row.cells:
                # The second argument is a literal tab, so each output line
                # is the cell text followed by a space and a tab.
                print(cell.text, '\t')

def getinfo(wordfile, dest=None):
    '''Unpack a .docx archive and print the name of every member it contains.

    A .docx file is opened here as an ordinary zip archive; each member is
    extracted and its archive-relative name is printed, in archive order.

    :param wordfile: path of the .docx (zip) file to unpack.
    :param dest: directory to extract into. ``None`` (the default) extracts
        into the current working directory, which is what the original
        one-argument version always did.
    :return: None; member names are written to stdout.
    '''
    # The context manager closes the archive even if an extraction fails;
    # the original never closed the handle.
    with zipfile.ZipFile(wordfile, 'r') as archive:
        for member in archive.namelist():
            archive.extract(member, dest)
            print(member)
        
'''
输出解压后的信息：
_rels/
_rels/.rels
customXml/
customXml/_rels/
customXml/_rels/item1.xml.rels
customXml/_rels/item2.xml.rels
customXml/item1.xml
customXml/item2.xml
customXml/itemProps1.xml
customXml/itemProps2.xml
docProps/
docProps/app.xml
docProps/core.xml
docProps/custom.xml
docProps/thumbnail.wmf
word/
word/_rels/
word/_rels/document.xml.rels
word/document.xml
word/fontTable.xml
word/media/
word/media/image1.jpeg
word/numbering.xml
word/settings.xml
word/styles.xml
word/theme/
word/theme/theme1.xml
'''





def getpic(path, zip_path, tmp_path, store_path):
    '''Copy every image embedded in a .docx file into ``store_path``.

    The document is opened directly as a zip archive, its ``word/media``
    members are unpacked into ``tmp_path``, and each image is copied to
    ``store_path`` under a name derived from the document path, so images
    from different documents do not overwrite each other.

    :param path: the source .docx file. It is only read, never renamed.
    :param zip_path: unused. The original renamed the document to this path
        before unzipping; the parameter is kept so existing calls still work.
    :param tmp_path: staging directory for the unpacked images. It is created
        if missing, and every sub-directory in it is deleted afterwards.
    :param store_path: directory that receives the images. It is created if
        missing.
    :return: list of the image paths written to ``store_path``, sorted by
        image name; empty when the document embeds no images.
    '''
    media_dir = os.path.join(tmp_path, 'word', 'media')
    os.makedirs(tmp_path, exist_ok=True)
    os.makedirs(store_path, exist_ok=True)

    # Flatten the document path into a file-name prefix. Forward slashes are
    # replaced as well as backslashes: left in place they would make
    # os.path.join() below point outside store_path.
    prefix = path.replace('\\', '_').replace('/', '_').replace(':', '')

    saved = []
    try:
        # Opening the .docx directly avoids the rename round trip, which left
        # the document stuck under its .zip name if extraction failed.
        with zipfile.ZipFile(path, 'r') as archive:
            members = [name for name in archive.namelist()
                       if name.startswith('word/media/')]
            archive.extractall(tmp_path, members)

        # A document without images has no word/media folder at all.
        if os.path.isdir(media_dir):
            for image in sorted(os.listdir(media_dir)):
                source = os.path.join(media_dir, image)
                if not os.path.isfile(source):
                    continue
                target = os.path.join(store_path, prefix + '_' + image)
                shutil.copy(source, target)
                saved.append(target)
    finally:
        # Empty the staging directory for the next document, even on failure.
        # As in the original, only sub-directories are removed.
        for entry in os.listdir(tmp_path):
            entry_path = os.path.join(tmp_path, entry)
            if os.path.isdir(entry_path):
                shutil.rmtree(entry_path)
    return saved

    
            

if __name__ == '__main__':
    # Example run: pull the embedded images out of one Word document.
    # Source document.
    path = r'E:\dogcat\提取图片\log.docx'
    # Name the document takes while it is treated as a zip archive.
    zip_path = r'E:\dogcat\提取图片\log.zip'
    # Staging folder for the unpacked archive contents.
    tmp_path = r'E:\dogcat\提取图片\tmp'
    # Folder that receives the extracted images.
    store_path = r'E:\dogcat\提取图片\测试'
    # The result was bound to an unused name before; just make the call.
    getpic(path, zip_path, tmp_path, store_path)

至于 doc 文件，先把它另存为 docx 文件，再按上面的方法处理即可：

def docTTTTTdocx(doc_name, docx_name):
    '''Convert a legacy .doc file to .docx by driving Word through COM.

    The conversion stays best-effort, as in the original: a failure is
    reported on stdout instead of being raised. Unlike the original, the
    failure is no longer silent and Word is always shut down again.

    :param doc_name: path of the .doc file to convert. Relative paths are
        resolved against the current working directory before being handed
        to Word.
    :param docx_name: path of the .docx file to write, resolved the same way.
    :return: True when the document was saved, False when anything failed.
    '''
    word = None
    try:
        # Imported here because the original referenced ``client`` without
        # importing it anywhere visible. Requires pywin32 on Windows.
        from win32com import client

        word = client.Dispatch("Word.Application")
        # Word does not resolve relative paths against Python's working
        # directory, so hand it absolute paths.
        doc = word.Documents.Open(os.path.abspath(doc_name))
        try:
            # 16 is wdFormatDocumentDefault in Word's WdSaveFormat
            # enumeration, i.e. the default .docx format.
            doc.SaveAs(os.path.abspath(docx_name), 16)
        finally:
            doc.Close()
    except Exception as exc:
        # Best-effort boundary: report the failure instead of swallowing it.
        print("doc -> docx conversion failed for {!r}: {}".format(doc_name, exc))
        return False
    finally:
        # Always quit, otherwise a hidden Word process is left running.
        if word is not None:
            word.Quit()
    return True


如果这里转换不成功，可能是路径的问题（Word 不一定能正确解析相对路径），把 doc_name 换成完整的绝对路径即可，如下：

from win32com.client import Dispatch

def docToDocxR(docPath, docxPath):
    '''Save a .doc file as .docx, resolving both paths against the script folder.

    :param docPath: path of the .doc file. A relative path is resolved
        against ``sys.path[0]``; an absolute path is used as given.
    :param docxPath: path of the .docx file to write, resolved the same way.
    :return: None.
    :raises Exception: errors from Word/COM propagate to the caller; Word is
        still shut down before they do.
    '''
    # Local import: ``sys`` is not imported anywhere in the visible file,
    # so the original raised NameError here.
    import sys

    word = Dispatch('Word.Application')
    try:
        pathPrefix = sys.path[0] + '\\'
        print(pathPrefix)
        # os.path.join leaves an absolute docPath untouched, where plain
        # string prefixing produced an invalid path. abspath covers an empty
        # sys.path[0].
        source = os.path.abspath(os.path.join(sys.path[0], docPath))
        target = os.path.abspath(os.path.join(sys.path[0], docxPath))
        doc = word.Documents.Open(source)
        try:
            # 12 is wdFormatXMLDocument in Word's WdSaveFormat enumeration.
            doc.SaveAs(target, FileFormat=12)
        finally:
            doc.Close()
    finally:
        # Always quit, otherwise a hidden Word process is left running.
        word.Quit()

参考:

https://blog.csdn.net/qq_40925239/article/details/83279957

https://blog.csdn.net/qq_15969343/article/details/81673970

本文链接：https://www.cnblogs.com/51python/p/11033002.html

python读取word中的段落、表、图+++++++++++Doc转换Docx

python读取word中的段落、表、图+++++++++++Doc转换Docx的更多相关文章

随机推荐

热门专题

目录导航