你不可能靠眼睛和雙手逐一解析成千上萬筆 PDF 資料，我們應該開發自動化工具來處理。
from PyPDF2 import PdfFileReader
def get_info(path):
    """Print and return the document metadata of the PDF at ``path``.

    The file is opened in binary mode and read with ``PdfFileReader``. The
    object returned by ``getDocumentInfo()`` is printed, then its individual
    fields are collected together with the page count.

    Args:
        path: Filesystem path of the PDF file to inspect.

    Returns:
        dict: Keys ``author``, ``creator``, ``producer``, ``subject``,
        ``title`` and ``number_of_pages``. The five metadata entries are
        ``None`` when ``getDocumentInfo()`` returns ``None``.
    """
    fields = ('author', 'creator', 'producer', 'subject', 'title')
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        info = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()
        print(info)
        # Collect the fields while the file is still open, in case the
        # reader needs the stream to produce them.
        if info is None:
            # No document-information object: report every field as
            # missing instead of failing on attribute access.
            metadata = dict.fromkeys(fields)
        else:
            metadata = {name: getattr(info, name) for name in fields}
    metadata['number_of_pages'] = number_of_pages
    return metadata
def text_extractor(path, page_number=16):
    """Print one page of the PDF at ``path`` and the text extracted from it.

    Args:
        path: Filesystem path of the PDF file to open (read in binary mode).
        page_number: Zero-based index handed to ``getPage()``. Defaults to
            16 (the 17th page), the value that used to be hard-coded.

    Returns:
        The value returned by the page's ``extractText()``, which is also
        printed.

    Raises:
        IndexError: If ``page_number`` is not in
            ``0 .. getNumPages() - 1``.
    """
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        total_pages = pdf.getNumPages()
        # Fail with a message that names the file and its page count
        # rather than a bare list-index error from inside the reader.
        if not 0 <= page_number < total_pages:
            raise IndexError(
                'page index {} out of range: {!r} has {} page(s)'.format(
                    page_number, path, total_pages))
        page = pdf.getPage(page_number)
        print(page)
        print('Page type: {}'.format(str(type(page))))
        text = page.extractText()
        print(text)
        return text
if __name__ == '__main__':
    # Script entry point: run both demos, in order, on the same sample file.
    path = 'sample.pdf'
    for demo in (get_info, text_extractor):
        demo(path)
Reference
https://www.blog.pythonlibrary.org/2018/06/07/an-intro-to-pypdf2/