Platinum Partner
python,architects,bigdata,languages,tips and tricks

Scraping PDF Text with Python

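This example walks a directory tree, looks for files whose names end in “.pdf”, and writes a text rendition of each one to a “.txt” file next to it (for example, “report.pdf” produces “report.pdf.txt”). It relies on the pdfminer library.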

import sys
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice, TagExtractor
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams
import os
 
# main
def main(outfile, fname):
    """Extract the text of one PDF and write it to a file or to stdout.

    Args:
        outfile: Path of the text file to create. A falsy value (such as
            '' or None) sends the extracted text to sys.stdout instead.
        fname: Path of the PDF file to read.

    Returns:
        None.

    Any exception raised while opening the files or while pdfminer
    processes the document propagates to the caller; the files and the
    converter are still closed in that case.
    """
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()  # presumably an empty set selects every page -- confirm
    maxpages = 0     # presumably 0 means "no page limit" -- confirm
    # output option
    codec = 'utf-8'
    caching = True
    laparams = LAParams()

    # Propagate the debug level to the pdfminer classes used below.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)

    # open() instead of file(): file() exists only on Python 2.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            with open(fname, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, caching=caching,
                            check_extractable=True)
        finally:
            # Close the converter before the stream it writes to.
            device.close()
    finally:
        # Only close what this function opened; closing sys.stdout would
        # break any later output from the calling program.
        if outfp is not sys.stdout:
            outfp.close()
    return
 
# Walk the tree rooted at indir and, for every file whose name ends in
# ".pdf", write its text rendition to "<that file name>.txt" alongside it.
indir = "00"
for root, dirs, filenames in os.walk(indir):
    for fname in filenames:
        if not fname.endswith(".pdf"):
            continue
        pdf_path = os.path.join(root, fname)
        main(pdf_path + ".txt", pdf_path)


Published at DZone with permission of {{ articles[0].authors[0].realName }}, DZone MVB. (source)

Opinions expressed by DZone contributors are their own.

{{ tag }}, {{tag}},

{{ parent.title || parent.header.title}}

{{ parent.tldr }}

{{ parent.urlSource.name }}
{{ parent.authors[0].realName || parent.author}}

{{ parent.authors[0].tagline || parent.tagline }}

{{ parent.views }} ViewsClicks
Tweet

{{parent.nComments}}