运行 python 脚本时出现问题(pypdf/hex 错误)
我正在尝试使用 PyPDF 模块创建一个 Python 脚本。该脚本的作用是获取“Root”文件夹,合并其中的所有 PDF,并将合并的 PDF 输出到“Output”文件夹中,并将其重命名为“Root.pdf”(包含拆分 PDF 的文件夹)。然后它对子目录执行相同的操作,为最终输出提供与子目录相同的名称。
在处理子目录时我陷入困境,给我一个与某些十六进制值相关的错误代码。 (看起来它正在获取一个不是十六进制的空值)
这是生成的错误代码:
Traceback (most recent call last):
File "C:\Documents and Settings\student3\Desktop\Test\pdfMergerV1.py", line 76, in <module>
files_recursively(path)
File "C:\Documents and Settings\student3\Desktop\Test\pdfMergerV1.py", line 74, in files_recursively
os.path.walk(path, process_file, ())
File "C:\Python27\lib\ntpath.py", line 263, in walk
walk(name, func, arg)
File "C:\Python27\lib\ntpath.py", line 259, in walk
func(arg, top, names)
File "C:\Documents and Settings\student3\Desktop\Test\pdfMergerV1.py", line 38, in process_file
pdf = PdfFileReader(file( filename, "rb"))
File "C:\Python27\lib\site-packages\pyPdf\pdf.py", line 374, in __init__
self.read(stream)
File "C:\Python27\lib\site-packages\pyPdf\pdf.py", line 775, in read
newTrailer = readObject(stream, self)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 67, in readObject
return DictionaryObject.readFromStream(stream, pdf)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 531, in readFromStream
value = readObject(stream, pdf)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 58, in readObject
return ArrayObject.readFromStream(stream, pdf)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 153, in readFromStream
arr.append(readObject(stream, pdf))
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 69, in readObject
return readHexStringFromStream(stream)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 276, in readHexStringFromStream
txt += chr(int(x, base=16))
ValueError: invalid literal for int() with base 16: '\x00\x00'
这是脚本的源代码:
#----------------------------------------------------------------------------------------------
# Name: pdfMerger
# Purpose: Automatic merging of all PDF files in a directory and its sub-directories and
# rename them according to the folder itself. Requires the pyPDF Module
#
# Current: Processes all the PDF files in the current directory
# To-Do: Process the sub-directories.
#
# Version: 1.0
# Author: Brian Livori
#
# Created: 03/08/2011
# Copyright: (c) Brian Livori 2011
# Licence: Open-Source
#---------------------------------------------------------------------------------------------
#!/usr/bin/env <strong class="highlight">python</strong>
import os
import glob
import sys
import fnmatch
from pyPdf import PdfFileReader, PdfFileWriter
output = PdfFileWriter()
path = str(os.getcwd())
x = 0
def process_file(_, path, filelist):
for filename in filelist:
if filename.endswith('.pdf'):
filename = os.path.join(path, filename)
print "Merging " + filename
pdf = PdfFileReader(file( filename, "rb"))
x = pdf.getNumPages()
i = 0
while (i != x):
output.addPage(pdf.getPage(i))
print "Merging page: " + str(i+1) + "/" + str(x)
i += 1
output_dir = "\Output\\"
ext = ".pdf"
dir = os.path.basename(path)
outputpath = str(os.getcwd()) + output_dir
final_output = outputpath
if os.path.exists(final_output) != True:
os.mkdir(final_output)
outputStream = file(final_output + dir + ext, "wb")
os.path.join(outputStream)
output.write(outputStream)
outputStream.close()
else:
outputStream = file(final_output + dir + ext, "wb")
os.path.join(outputStream)
output.write(outputStream)
outputStream.close()
def files_recursively(topdir):
os.path.walk(path, process_file, ())
files_recursively(path)
I am trying to create a Python script using the PyPDF Module. What the script does it take the 'Root' folder, merges all the PDFs in it and outputs the merged PDF in an 'Output' folder and renames it to 'Root.pdf' (the folder which containes the split PDFs). What it does then is do the same with the sub-directories, giving the final output a name equal to the sub-directories.
I'm stuck when coming to process the sub-directories, giving me an error code related to some hex values. (it seems that it is getting a null value which is not in hex)
Here is the error code generated:
Traceback (most recent call last):
File "C:\Documents and Settings\student3\Desktop\Test\pdfMergerV1.py", line 76, in <module>
files_recursively(path)
File "C:\Documents and Settings\student3\Desktop\Test\pdfMergerV1.py", line 74, in files_recursively
os.path.walk(path, process_file, ())
File "C:\Python27\lib\ntpath.py", line 263, in walk
walk(name, func, arg)
File "C:\Python27\lib\ntpath.py", line 259, in walk
func(arg, top, names)
File "C:\Documents and Settings\student3\Desktop\Test\pdfMergerV1.py", line 38, in process_file
pdf = PdfFileReader(file( filename, "rb"))
File "C:\Python27\lib\site-packages\pyPdf\pdf.py", line 374, in __init__
self.read(stream)
File "C:\Python27\lib\site-packages\pyPdf\pdf.py", line 775, in read
newTrailer = readObject(stream, self)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 67, in readObject
return DictionaryObject.readFromStream(stream, pdf)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 531, in readFromStream
value = readObject(stream, pdf)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 58, in readObject
return ArrayObject.readFromStream(stream, pdf)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 153, in readFromStream
arr.append(readObject(stream, pdf))
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 69, in readObject
return readHexStringFromStream(stream)
File "C:\Python27\lib\site-packages\pyPdf\generic.py", line 276, in readHexStringFromStream
txt += chr(int(x, base=16))
ValueError: invalid literal for int() with base 16: '\x00\x00'
This is the source code for the script:
#----------------------------------------------------------------------------------------------
# Name: pdfMerger
# Purpose: Automatic merging of all PDF files in a directory and its sub-directories and
# rename them according to the folder itself. Requires the pyPDF Module
#
# Current: Processes all the PDF files in the current directory
# To-Do: Process the sub-directories.
#
# Version: 1.0
# Author: Brian Livori
#
# Created: 03/08/2011
# Copyright: (c) Brian Livori 2011
# Licence: Open-Source
#---------------------------------------------------------------------------------------------
#!/usr/bin/env <strong class="highlight">python</strong>
import os
import glob
import sys
import fnmatch
from pyPdf import PdfFileReader, PdfFileWriter
output = PdfFileWriter()
path = str(os.getcwd())
x = 0
def process_file(_, path, filelist):
for filename in filelist:
if filename.endswith('.pdf'):
filename = os.path.join(path, filename)
print "Merging " + filename
pdf = PdfFileReader(file( filename, "rb"))
x = pdf.getNumPages()
i = 0
while (i != x):
output.addPage(pdf.getPage(i))
print "Merging page: " + str(i+1) + "/" + str(x)
i += 1
output_dir = "\Output\\"
ext = ".pdf"
dir = os.path.basename(path)
outputpath = str(os.getcwd()) + output_dir
final_output = outputpath
if os.path.exists(final_output) != True:
os.mkdir(final_output)
outputStream = file(final_output + dir + ext, "wb")
os.path.join(outputStream)
output.write(outputStream)
outputStream.close()
else:
outputStream = file(final_output + dir + ext, "wb")
os.path.join(outputStream)
output.write(outputStream)
outputStream.close()
def files_recursively(topdir):
os.path.walk(path, process_file, ())
files_recursively(path)
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
看起来您正在阅读的 PDF 文件不是有效的 PDF 文件,或者它们比 PyPDF 准备的更加奇特。您确定有好的 PDF 文件可供阅读吗?
另外,您的代码中有一些奇怪的事情,但这一个可能真的很重要:
您有一个
\O
转义序列,这不是您想要的。It looks like the PDF files you are reading are not valid PDF files, or they are more exotic than PyPDF is prepared for. Are you sure you have good PDF files to read?
Also, there are a few odd things in your code, but this one might really matter:
You have a
\O
escape sequence there which isn't what you want.