多线程pdf文件列表时“不支持文件格式” - python

我在自学python线程的基础知识,并陷入困境。
我希望脚本将功能应用于pdf列表。简单地说,此函数应该计算每个pdf文件中的表格数量,然后返回每个文件有多少张表格的组合列表。

现在,我收到一条错误消息,提示“不支持文件格式”(File format not supported)。据我所知,列表中的每个路径都是以.pdf结尾的完整路径。我不知道自己哪里做错了。

我已将代码简化为要点,并在下面包含了我的代码

import camelot
from multiprocessing.dummy import Pool as ThreadPool 
import glob
import os


#get a list of all the pdf paths in the directory I am interested in
# NOTE(review): os.listdir() yields bare file names, so os.path.abspath()
# resolves them against the current working directory rather than the listed
# folder -- compare the path reported in the FileNotFoundError traceback.
pdfs = [os.path.abspath(x) for x in os.listdir(r'C:\Users\josiahh\Desktop\threading_learning')]

#format each path to have the r letter in front of it
# NOTE(review): this builds a new string that literally starts with r' and
# ends with ', so it no longer ends with .pdf and camelot's extension check
# rejects it. The r prefix only has meaning on a string literal in source code.
rpdfs = ["r'" + pdf + "'" for pdf in pdfs]

#function that counts each table in the pdf. THIS IS WHERE SOMETHING IS WRONG...I THINK
# NOTE(review): both tracebacks are raised inside camelot while it handles the
# path string it was given, so the inputs are the likely culprit, not this body.
listoflengths = []
def len_table5(filepath):    
    """Count the tables camelot finds in one PDF and record the count.

    filepath is handed to camelot.read_pdf unchanged. The number of tables
    is appended to the module-level listoflengths; nothing is returned.
    """
    tables = camelot.read_pdf(filepath, pages = '1-end',flavor='stream')
    tablelength = len(tables)
    listoflengths.append(tablelength)

#threading code
# ThreadPool is multiprocessing.dummy.Pool (see the import), created with 5 workers.
pool = ThreadPool(5) 
# map() waits for every call and re-raises a worker's exception here (this is
# the line both tracebacks point at). len_table5 returns nothing, so results
# only ever holds None values; the counts live in listoflengths.
results = pool.map(len_table5, rpdfs)
pool.close() 
pool.join() 

任何帮助都将不胜感激。如果有需要我进一步说明的地方,请告诉我。

编辑:
在文件名前使用r时的回溯

---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
<ipython-input-57-d38bdf75c567> in <module>
      1 
      2 pool = ThreadPool(5)
----> 3 results = pool.map(len_table5, rpdfs)
      4 pool.close()
      5 pool.join()

~\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py in map(self, func, iterable, chunksize)
    286         in a list that is returned.
    287         '''
--> 288         return self._map_async(func, iterable, mapstar, chunksize).get()
    289 
    290     def starmap(self, func, iterable, chunksize=None):

~\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py in get(self, timeout)
    668             return self._value
    669         else:
--> 670             raise self._value
    671 
    672     def _set(self, i, obj):

~\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
    117         job, i, func, args, kwds = task
    118         try:
--> 119             result = (True, func(*args, **kwds))
    120         except Exception as e:
    121             if wrap_exception and func is not _helper_reraises_exception:

~\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py in mapstar(args)
     42 
     43 def mapstar(args):
---> 44     return list(map(*args))
     45 
     46 def starmapstar(args):

<ipython-input-54-025080eb0d6f> in len_table5(filepath)
      1 listoflengths = []
      2 def len_table5(filepath):
----> 3     tables = camelot.read_pdf(filepath, pages = '1-end',flavor='stream')
      4     tablelength = len(tables)
      5     listoflengths.append(tablelength)

~\AppData\Local\Continuum\anaconda3\lib\site-packages\camelot\io.py in read_pdf(filepath, pages, password, flavor, suppress_stdout, layout_kwargs, **kwargs)
    101 
    102         validate_input(kwargs, flavor=flavor)
--> 103         p = PDFHandler(filepath, pages=pages, password=password)
    104         kwargs = remove_extra(kwargs, flavor=flavor)
    105         tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,

~\AppData\Local\Continuum\anaconda3\lib\site-packages\camelot\handlers.py in __init__(self, filepath, pages, password)
     33         self.filepath = filepath
     34         if not filepath.lower().endswith('.pdf'):
---> 35             raise NotImplementedError("File format not supported")
     36 
     37         if password is None:

NotImplementedError: File format not supported

在文件路径前不加r时的回溯

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-59-07744a46a83f> in <module>
      1 
      2 pool = ThreadPool(5)
----> 3 results = pool.map(len_table5, pdfs)
      4 pool.close()
      5 pool.join()

~\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py in map(self, func, iterable, chunksize)
    286         in a list that is returned.
    287         '''
--> 288         return self._map_async(func, iterable, mapstar, chunksize).get()
    289 
    290     def starmap(self, func, iterable, chunksize=None):

~\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py in get(self, timeout)
    668             return self._value
    669         else:
--> 670             raise self._value
    671 
    672     def _set(self, i, obj):

~\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
    117         job, i, func, args, kwds = task
    118         try:
--> 119             result = (True, func(*args, **kwds))
    120         except Exception as e:
    121             if wrap_exception and func is not _helper_reraises_exception:

~\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py in mapstar(args)
     42 
     43 def mapstar(args):
---> 44     return list(map(*args))
     45 
     46 def starmapstar(args):

<ipython-input-58-e6499958826d> in len_table5(filepath)
     14 listoflengths = []
     15 def len_table5(filepath):
---> 16     tables = camelot.read_pdf(filepath, pages = '1-end',flavor='stream')
     17     tablelength = len(tables)
     18     listoflengths.append(tablelength)

~\AppData\Local\Continuum\anaconda3\lib\site-packages\camelot\io.py in read_pdf(filepath, pages, password, flavor, suppress_stdout, layout_kwargs, **kwargs)
    101 
    102         validate_input(kwargs, flavor=flavor)
--> 103         p = PDFHandler(filepath, pages=pages, password=password)
    104         kwargs = remove_extra(kwargs, flavor=flavor)
    105         tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout,

~\AppData\Local\Continuum\anaconda3\lib\site-packages\camelot\handlers.py in __init__(self, filepath, pages, password)
     41             if sys.version_info[0] < 3:
     42                 self.password = self.password.encode('ascii')
---> 43         self.pages = self._get_pages(self.filepath, pages)
     44 
     45     def _get_pages(self, filepath, pages):

~\AppData\Local\Continuum\anaconda3\lib\site-packages\camelot\handlers.py in _get_pages(self, filepath, pages)
     64             page_numbers.append({'start': 1, 'end': 1})
     65         else:
---> 66             infile = PdfFileReader(open(filepath, 'rb'), strict=False)
     67             if infile.isEncrypted:
     68                 infile.decrypt(self.password)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\josiahh\\camelot - Copy (2) - Copy.pdf'

参考方案

有一些事情要考虑:

不要硬编码路径字符串:这样不灵活,而且很可能无法在其他计算机上使用。
os.listdir()只返回文件夹中的文件名(不含目录),因此对其调用os.path.abspath()会以当前工作目录为基准拼接路径,得不到正确的结果。
不清楚为什么要在文件名前加上r前缀,您真的需要这样做吗?r前缀只对源代码中的字符串字面量有意义,拼接到字符串内容里只会让路径不再以.pdf结尾。

更正的版本将是:

import os
from multiprocessing.dummy import Pool as ThreadPool

import camelot

# Resolve the folder relative to this script instead of hard-coding an
# absolute path, so the script keeps working when moved to another machine.
BASE_PATH = os.path.dirname(os.path.abspath(__file__))

FOLDER_PATH = os.path.join(BASE_PATH, "threading_learning")

# os.listdir() returns bare names, so each one is joined onto FOLDER_PATH.
# Only *.pdf entries are kept: camelot raises
# NotImplementedError("File format not supported") for any path that does not
# end in ".pdf" (case-insensitive). Sorted so the processing order is stable.
pdfs = [
    os.path.join(FOLDER_PATH, file_name)
    for file_name in sorted(os.listdir(FOLDER_PATH))
    if file_name.lower().endswith(".pdf")
]

# Table counts collected as a side effect of len_table5, in the order the
# calls happen to finish (not necessarily the order of the input paths).
listoflengths = []


def len_table5(filepath):
    """Count the tables camelot detects in a single PDF.

    Args:
        filepath: Path to the PDF; passed to camelot.read_pdf unchanged.

    Returns:
        The number of tables camelot reports for pages '1-end' with the
        'stream' flavor. The same number is also appended to the
        module-level listoflengths.

    Raises:
        Whatever camelot.read_pdf raises for the given path, e.g.
        NotImplementedError when the path does not end in '.pdf'.
    """
    tables = camelot.read_pdf(filepath, pages='1-end', flavor='stream')
    tablelength = len(tables)
    # Kept for code that reads the global list after the pool has finished.
    listoflengths.append(tablelength)
    # Returning the count lets pool.map() hand back one value per input path,
    # in input order, instead of a list of None.
    return tablelength


# threading code
# ThreadPool is multiprocessing.dummy.Pool, i.e. a pool of threads.
pool = ThreadPool(5)
try:
    # map() blocks until every path has been processed and re-raises a
    # worker's exception in this thread if one of the calls fails.
    results = pool.map(len_table5, pdfs)
finally:
    # Shut the workers down even when a file fails to parse, so a failure
    # does not leave the pool open.
    pool.close()
    pool.join()

print(listoflengths)

Python pytz时区函数返回的时区为9分钟 - python

由于某些原因,我无法从以下代码中找出原因:>>> from pytz import timezone >>> timezone('America/Chicago') 我得到:<DstTzInfo 'America/Chicago' LMT-1 day, 18:09:00 STD…

在返回'Response'(Python)中传递多个参数 - python

我正在使用Angular开发,并且用到了Http请求和响应。是否可以在“Response”中发送多个参数?Angular文件:this.http.get("api/agent/applicationaware").subscribe((data:any)... python文件:def get(request): ... return Response(seriali…

Python exchangelib在子文件夹中读取邮件 - python

我想从Outlook邮箱的子文件夹中读取邮件。Inbox ├──myfolder 我可以使用account.inbox.all()阅读收件箱,但我想阅读myfolder中的邮件我尝试了此页面folder部分中的内容,但无法正确完成https://pypi.python.org/pypi/exchangelib/ 参考方案 您需要首先掌握Folder的myfo…

R'relaimpo'软件包的Python端口 - python

我需要计算Lindeman-Merenda-Gold(LMG)分数,以进行回归分析。我发现R语言的relaimpo包下有该文件。不幸的是,我对R没有任何经验。我检查了互联网,但找不到。这个程序包有python端口吗?如果不存在,是否可以通过python使用该包? python参考方案 最近,我遇到了pingouin库。

如何将带'-'的字符串传给本地Node.js脚本进行解析? - python

我正在使用本地Node.js脚本来处理字符串。我在把'-'字符串传给本地Node.js脚本解析时遇到了问题。render.js:#! /usr/bin/env -S node -r esm let argv = require('yargs') .usage('$0 [string]') .argv; console.log(argv…