Over a million developers have joined DZone.

Scraping PDF Text with Python

· Web Dev Zone

Start coding today to experience the powerful engine that drives data application’s development, brought to you in partnership with Qlik.

This example will walk a directory structure, look for PDFs, and make a “.txt” file next to the PDF with a text rendition.

import sys
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
import os
 
# main
def main(outfile, fname):
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outtype = 'text'
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
 
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
 
    rsrcmgr = PDFResourceManager(caching=caching)
    outtype = 'text'
    if outfile:
        outfp = file(outfile, 'w')
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
 
    fp = file(fname, 'rb')
    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                password=password, caching=caching,
                check_extractable=True)
    fp.close()
    device.close()
    outfp.close()
    return
 
indir = "00"
for root, dirs, filenames in os.walk(indir):
   for fname in filenames:
       if (fname.endswith(".pdf")):
            main(os.path.join(root, fname) + ".txt", os.path.join(root, fname))


Create data driven applications in Qlik’s free and easy to use coding environment, brought to you in partnership with Qlik.

Topics:

Published at DZone with permission of Gary Sieling, DZone MVB. See the original article here.

Opinions expressed by DZone contributors are their own.

The best of DZone straight to your inbox.

SEE AN EXAMPLE
Please provide a valid email address.

Thanks for subscribing!

Awesome! Check your inbox to verify your email so you can start receiving the latest in tech news and resources.
Subscribe

{{ parent.title || parent.header.title}}

{{ parent.tldr }}

{{ parent.urlSource.name }}