PDF 转 PPTX
用途
- beamer 生成的 PDF 转 PPTX
- marp 导出的 PDF 转 PPTX
- PPTX 导出的 PDF 再转为高清图片档 PPTX
过程
- 将 PDF 文档导出为 PNG 图片,每页一张。
- 每张 PNG 作为一张 slide,做成一个 PPTX 文档。
安装
需安装:
- python-pptx:pip install python-pptx
- pymupdf:pip install pymupdf
使用
用法:python pdf_to_pptx.py test.pdf {RATIO} {ZOOM-FACTOR},其中 {RATIO} 表示 PDF 页面的宽高比例,{ZOOM-FACTOR} 控制图片清晰度。
如
python pdf_to_pptx.py test.pdf 4:3 5
或
python pdf_to_pptx.py test.pdf 16:9 10
代码
pdf_to_pptx.py
# encoding: UTF-8
# pip install python-pptx
# https://python-pptx.readthedocs.io/
# pip install pymupdf
# https://pymupdf.readthedocs.io/en/latest/
import sys, math, io
from pptx import Presentation
from pptx.util import Inches
from pptx.parts.image import Image
import fitz
# args: pdf_filename, aspect_ratio, zoom_factor
# Render every page of a PDF to a PNG image with PyMuPDF and place each image
# on its own blank slide of a newly created PPTX file.


def parse_aspect_ratio(ratio):
    """Parse an aspect ratio written as 'W:H' (for example '16:9').

    Returns:
        A (width, height) tuple of positive ints; the script uses them as
        the slide width and height in inches.

    Raises:
        ValueError: if the text is not exactly two ':'-separated integers,
            or if either value is not positive.
    """
    width, height = map(int, ratio.split(':'))
    if width <= 0 or height <= 0:
        raise ValueError('aspect ratio must be positive, got {!r}'.format(ratio))
    return width, height


def derive_pptx_filename(pdf_filename):
    """Return the output path for the converted presentation.

    Only a trailing '.pdf' extension (matched case-insensitively) is
    stripped.  A plain str.replace('.pdf', ...) would also rewrite '.pdf'
    occurring earlier in the path and, worse, would return the input path
    unchanged for names such as 'deck.PDF', so saving would overwrite the
    source PDF.
    """
    if pdf_filename.lower().endswith('.pdf'):
        stem = pdf_filename[:-len('.pdf')]
    else:
        stem = pdf_filename
    return stem + '_converted.pptx'


def render_page(page, matrix):
    """Rasterize one PDF page and return (pixmap, png_bytes).

    Newer PyMuPDF releases spell the methods get_pixmap()/tobytes(); the
    camelCase getPixmap()/getImageData() names are only available on old
    installations, so they are used purely as a fallback.
    """
    if hasattr(page, 'get_pixmap'):
        pix = page.get_pixmap(matrix=matrix, alpha=False)
        return pix, pix.tobytes('png')
    pix = page.getPixmap(matrix=matrix, alpha=False)
    return pix, pix.getImageData('png')


def main(argv):
    """Convert the PDF named in argv into '<name>_converted.pptx'.

    Args:
        argv: [script, pdf_filename, 'W:H' aspect ratio, zoom factor].
    """
    if len(argv) < 4:
        sys.exit('usage: python pdf_to_pptx.py test.pdf {RATIO} {ZOOM-FACTOR}')
    pdf_filename = argv[1]
    width, height = parse_aspect_ratio(argv[2])
    # float() still accepts the integer factors used before, and additionally
    # allows fractional zoom such as 2.5.
    zoom_factor = float(argv[3])

    prs = Presentation()
    prs.slide_width = Inches(width)
    prs.slide_height = Inches(height)
    blank_slide_layout = prs.slide_layouts[6]

    mat = fitz.Matrix(zoom_factor, zoom_factor)  # zoom factor in each dimension
    doc = fitz.open(pdf_filename)
    try:
        print('#pages', len(doc))
        for i, page in enumerate(doc):
            pix, png_data = render_page(page, mat)
            slide = prs.slides.add_slide(blank_slide_layout)
            # Only the width is given, so the picture keeps the aspect ratio
            # of the rendered page.
            slide.shapes.add_picture(io.BytesIO(png_data), 0, 0, width=prs.slide_width)
            print('page {}, image size: ({}, {})'.format(i + 1, pix.width, pix.height))
    finally:
        # Release the PDF even if rendering a page fails.
        doc.close()

    pptx_filename = derive_pptx_filename(pdf_filename)
    prs.save(pptx_filename)
    print('saved to', pptx_filename)


if __name__ == '__main__':
    main(sys.argv)
旧版本
安装及使用
需安装:
- python-pptx:pip install python-pptx
- pdftk
- pdftoppm
用法:python pdf_to_pptx.py test.pdf {RATIO} {DPI},其中 {RATIO} 表示 PDF 页面的宽高比例,{DPI} 表示图片 DPI,控制图片清晰度。
如
python pdf_to_pptx.py test.pdf 4:3 500
或
python pdf_to_pptx.py test.pdf 16:9 1000
代码
pdf_to_pptx.py
# encoding: UTF-8
# pip install python-pptx
# https://python-pptx.readthedocs.io/
# need pdftk, pdftoppm installed
import sys
import os
import re
import math
from pptx import Presentation
from pptx.util import Inches
from pptx.parts.image import Image
# args: pdf_filename, aspect_ratio, dpi
# Legacy converter: uses the external tools pdftk (page count) and pdftoppm
# (page rendering) and puts each rendered PNG on its own blank slide.
import subprocess  # argument-list process calls; avoids shell quoting issues

TMP_DIR = 'png_tmps'  # rendered pages and the pdftk dump are left here


def parse_aspect_ratio(ratio):
    """Parse an aspect ratio written as 'W:H' (for example '16:9').

    Returns:
        A (width, height) tuple of positive ints; the script uses them as
        the slide width and height in inches.

    Raises:
        ValueError: if the text is not exactly two ':'-separated integers,
            or if either value is not positive.
    """
    width, height = map(int, ratio.split(':'))
    if width <= 0 or height <= 0:
        raise ValueError('aspect ratio must be positive, got {!r}'.format(ratio))
    return width, height


def derive_pptx_filename(pdf_filename):
    """Return the output path for the converted presentation.

    Only a trailing '.pdf' extension (matched case-insensitively) is
    stripped, so a name like 'deck.PDF' can no longer map onto itself and
    cause the source PDF to be overwritten on save.
    """
    if pdf_filename.lower().endswith('.pdf'):
        stem = pdf_filename[:-len('.pdf')]
    else:
        stem = pdf_filename
    return stem + '_converted.pptx'


def parse_page_count(dump_text):
    """Extract the page count from the text written by 'pdftk dump_data_utf8'.

    Raises:
        ValueError: if no 'NumberOfPages: <n>' entry is present.
    """
    match = re.search(r'NumberOfPages: (\d+)', dump_text)
    if match is None:
        raise ValueError('NumberOfPages not found in pdftk output')
    return int(match.group(1))


def count_pages(pdf_filename, tmp_dir):
    """Dump the PDF metadata with pdftk and return the number of pages."""
    data_path = os.path.join(tmp_dir, 'data.txt')
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact; check=True surfaces a failing pdftk run.
    subprocess.run(
        ['pdftk', pdf_filename, 'dump_data_utf8', 'output', data_path],
        check=True,
    )
    # Only the ASCII 'NumberOfPages' line matters, so undecodable bytes in
    # other metadata fields are replaced instead of aborting the run.
    with open(data_path, encoding='utf-8', errors='replace') as dump_file:
        return parse_page_count(dump_file.read())


def render_page_png(pdf_filename, page_no, dpi, tmp_dir):
    """Render one page with pdftoppm and return the path of the PNG file."""
    image_root = os.path.join(tmp_dir, str(page_no))
    subprocess.run(
        ['pdftoppm', pdf_filename, image_root, '-png',
         '-f', str(page_no), '-singlefile', '-r', str(dpi)],
        check=True,
    )
    # With -singlefile the output is exactly '<root>.png'.
    return image_root + '.png'


def main(argv):
    """Convert the PDF named in argv into '<name>_converted.pptx'.

    Args:
        argv: [script, pdf_filename, 'W:H' aspect ratio, dpi].
    """
    if len(argv) < 4:
        sys.exit('usage: python pdf_to_pptx.py test.pdf {RATIO} {DPI}')
    pdf_filename = argv[1]
    width, height = parse_aspect_ratio(argv[2])
    dpi = int(argv[3])

    os.makedirs(TMP_DIR, exist_ok=True)
    page_num = count_pages(pdf_filename, TMP_DIR)
    print('#pages:', page_num)

    prs = Presentation()
    prs.slide_width = Inches(width)
    prs.slide_height = Inches(height)
    blank_slide_layout = prs.slide_layouts[6]

    for i in range(1, page_num + 1):
        img_path = render_page_png(pdf_filename, i, dpi, TMP_DIR)
        img = Image.from_file(img_path)
        slide = prs.slides.add_slide(blank_slide_layout)
        # Only the width is given, so the picture keeps the aspect ratio of
        # the rendered page.
        slide.shapes.add_picture(img_path, 0, 0, width=prs.slide_width)
        img_width, img_height = img.size
        divisor = math.gcd(img_width, img_height)
        # Report the pixel size and the reduced aspect ratio of each page.
        print(i, img.size, '{}:{}'.format(img_width // divisor, img_height // divisor))

    pptx_filename = derive_pptx_filename(pdf_filename)
    prs.save(pptx_filename)
    print('saved to', pptx_filename)


if __name__ == '__main__':
    main(sys.argv)
END
2020.5.10 晚 22:13
2020.6.14 晚 21:08 添加新版本