"""Print the text of every page of ``banReport.pdf`` using pdfminer.

The script parses the PDF, runs pdfminer's layout analysis on each page and
prints the text of every layout element that exposes ``get_text``.
"""
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfparser import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice

# Open the PDF in binary mode. The ``with`` block guarantees the handle is
# closed even if parsing fails; all processing stays inside it because the
# parser reads from the file while pages are being processed.
with open("banReport.pdf", "rb") as fp:
    # Create the parser that reads from the open file.
    parser = PDFParser(fp)

    # Create the PDF document object.
    doc = PDFDocument()

    # Connect the parser and the document object to each other.
    parser.set_document(doc)
    doc.set_parser(parser)

    # Initialize the document with an empty password.
    doc.initialize('')

    # Create the PDF resource manager.
    resource = PDFResourceManager()

    # Layout-analysis parameters (library defaults).
    laparam = LAParams()

    # Create the aggregator device that collects each page's layout.
    device = PDFPageAggregator(resource, laparams=laparam)
    # Create the page interpreter that feeds pages into the device.
    interpreter = PDFPageInterpreter(resource, device=device)

    # Walk the pages provided by the document object.
    for page in doc.get_pages():
        # Let the interpreter process the page.
        interpreter.process_page(page)

        # Fetch the layout the aggregator built for this page; it is read
        # inside the loop so every page's text is printed.
        layout = device.get_result()

        for out in layout:
            # Only some layout elements carry text; skip the ones that
            # do not offer ``get_text``.
            if hasattr(out, "get_text"):
                print(out.get_text())