2019年2月19日 星期二

[python] 讀取 PDF 文件, 開發自動化工具

你不可能靠眼睛和雙手親自解析成千上萬筆 PDF 資料，我們應該開發自動化工具來處理。


from PyPDF2 import PdfFileReader


def get_info(path):
    """Print and return the metadata of the PDF file at *path*.

    Args:
        path: Filesystem path of the PDF document to inspect.

    Returns:
        A dict with the keys ``author``, ``creator``, ``producer``,
        ``subject``, ``title`` and ``number_of_pages``. The five metadata
        entries are ``None`` when the document carries no information
        dictionary or the individual field is absent.
    """
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)
        info = pdf.getDocumentInfo()
        number_of_pages = pdf.getNumPages()

    print(info)

    # Per the PyPDF2 docs, getDocumentInfo() may return None when the PDF has
    # no document information dictionary; getattr with a default keeps that
    # case from raising AttributeError.
    fields = ('author', 'creator', 'producer', 'subject', 'title')
    details = {name: getattr(info, name, None) for name in fields}
    details['number_of_pages'] = number_of_pages
    return details

def text_extractor(path, page_number=16):
    """Print and return the text of a single page of the PDF at *path*.

    Args:
        path: Filesystem path of the PDF document to read.
        page_number: Zero-based index of the page to extract. Defaults to
            16 (the 17th page), the value that used to be hard-coded.

    Returns:
        The text extracted from the requested page.

    Raises:
        IndexError: If ``page_number`` is outside the document's page range.
    """
    with open(path, 'rb') as f:
        pdf = PdfFileReader(f)

        # Check the range up front so a short document produces a clear
        # message instead of a bare "list index out of range".
        total_pages = pdf.getNumPages()
        if not 0 <= page_number < total_pages:
            raise IndexError(
                'page_number {} is out of range; document has {} page(s)'
                .format(page_number, total_pages)
            )

        # Fetch the requested page (PyPDF2 page indices start at zero).
        page = pdf.getPage(page_number)
        print(page)
        print('Page type: {}'.format(str(type(page))))

        text = page.extractText()
        print(text)

    return text

if __name__ == '__main__':
    # Script entry point: run each demo against the same sample document,
    # metadata dump first, then the page-text extraction.
    path = 'sample.pdf'
    for demo in (get_info, text_extractor):
        demo(path)


Reference
https://www.blog.pythonlibrary.org/2018/06/07/an-intro-to-pypdf2/