Django 上传 PDF,然后运行脚本抓取 PDF 内容并输出结果
我正在尝试创建一个 Django Web 应用程序,允许用户上传 PDF,然后由脚本抓取该 PDF,并输出和保存脚本抓取到的部分文本。
我找到了一些代码来实现文件上传部分,也已经有了抓取 PDF 的脚本,但不确定如何把两者结合起来完成这项任务。
views.py
from django.shortcuts import redirect, render
from .models import Document
from .forms import DocumentForm
def my_view(request):
    """Show the upload form and store a submitted file as a ``Document``.

    On GET an empty form is rendered. On a valid POST the uploaded file is
    saved and the client is redirected to the ``my-view`` URL. On an invalid
    POST the bound form is re-rendered together with an error message.
    """
    # The f-string is deliberate: it is a syntax error before Python 3.6,
    # which is what the message refers to.
    print(f"Great! You're using Python 3.6+. If you fail here, use the right version.")
    message = 'Upload PDF'
    # Handle file upload
    if request.method == 'POST':
        form = DocumentForm(request.POST, request.FILES)
        if form.is_valid():
            newdoc = Document(docfile=request.FILES['docfile'])
            newdoc.save()
            # NOTE(review): this is the point where the file is stored, so a
            # PDF-scraping step would be called here -- confirm the design.
            # Redirect to the document list after POST
            return redirect('my-view')
        else:
            message = 'The form is not valid. Fix the following error:'
    else:
        form = DocumentForm()  # An empty, unbound form
    # Load documents for the list page
    documents = Document.objects.all()
    # Render list page with the documents and the form
    context = {'documents': documents, 'form': form, 'message': message}
    return render(request, 'list.html', context)
forms.py
from django import forms
class DocumentForm(forms.Form):
    """Upload form exposing a single file field named ``docfile``."""

    # The label is the text shown next to the file chooser in the template.
    docfile = forms.FileField(label='Select a file')
models.py
from django.db import models
class Document(models.Model):
    """Database record for one uploaded file."""

    # upload_to contains strftime placeholders, so files are grouped into
    # documents/<year>/<month>/<day> sub-directories.
    docfile = models.FileField(upload_to='documents/%Y/%m/%d')
list.html
<!DOCTYPE html>
<html>
    <head>
        <meta charset="utf-8">
        <title>webpage</title>
    </head>
    <body>
        <!-- Upload form. enctype="multipart/form-data" is required for file uploads. -->
        <form action="{% url 'my-view' %}" method="post" enctype="multipart/form-data">
            {% csrf_token %}
            {{ message }}
            <p>{{ form.non_field_errors }}</p>
            <!-- Select a file: text -->
            <p>{{ form.docfile.label_tag }} {{ form.docfile.help_text }}</p>
            <!-- choose file button -->
            <p>
                {{ form.docfile.errors }}
                {{ form.docfile }}
            </p>
            <!-- Upload button -->
            <p><input type="submit" value="Upload"/></p>
        </form>
    </body>
</html>
编辑:添加了 urls.py
urls.py
from django.urls import path
from .views import my_view
# Serve the upload view at the app root; the name 'my-view' is what
# redirect() and the template's {% url %} tag look up.
urlpatterns = [
    path('', my_view, name='my-view'),
]
scrape.py
想要输出并保存 Plan_Name。
import os
import pdfplumber
import re
# Scan a directory for PDFs and pull the plan title out of each one.
#
# NOTE(review): 'C:User/...' has no separator after the drive letter, so on
# Windows it is resolved relative to the current directory of drive C:.
# Confirm the intended absolute path.
directory = r'C:User/Ant_Esc/Desktop'

# The title is the text between these two labels; the match is non-greedy so
# it stops at the first 'Program/Discipline' that follows.
pattern = 'Plan Title/Name .*? Program/Discipline'

# Cleaned plan names for every processed PDF, keyed by file name.
plan_names_by_file = {}

for filename in os.listdir(directory):
    if not filename.endswith('.pdf'):
        continue
    fullpath = os.path.join(directory, filename)

    with pdfplumber.open(fullpath) as pdf:
        # extract_text() may return None for a page without text (depends on
        # the pdfplumber version); fall back to '' so concatenation is safe.
        all_text = ''.join(
            ' ' + (page.extract_text() or '') for page in pdf.pages
        )
    all_text = all_text.replace('\n', '')

    # Keep Plan_Name a list for the whole iteration instead of rebinding it
    # to a string while looping over it, which kept only the last match.
    Plan_Name = [
        match.removesuffix('Program/Discipline')
        .removeprefix('Plan Title/Name ')
        .strip()  # drop the space left in front of the removed suffix
        for match in re.findall(pattern, all_text, re.DOTALL)
    ]
    plan_names_by_file[filename] = Plan_Name
    print(f'{filename}: {Plan_Name}')
I am attempting to create a Django web app that allows the user to upload a pdf then have a script scrape it and output and save certain text that the script scraped.
I was able to find some code to do the file upload part.
I have the script to scrape the pdf.
Not sure how to tie them together to accomplish this task.
views.py
from django.shortcuts import redirect, render
from .models import Document
from .forms import DocumentForm
def my_view(request):
    """Show the upload form and store a submitted file as a ``Document``.

    On GET an empty form is rendered. On a valid POST the uploaded file is
    saved and the client is redirected to the ``my-view`` URL. On an invalid
    POST the bound form is re-rendered together with an error message.
    """
    # The f-string is deliberate: it is a syntax error before Python 3.6,
    # which is what the message refers to.
    print(f"Great! You're using Python 3.6+. If you fail here, use the right version.")
    message = 'Upload PDF'
    # Handle file upload
    if request.method == 'POST':
        form = DocumentForm(request.POST, request.FILES)
        if form.is_valid():
            newdoc = Document(docfile=request.FILES['docfile'])
            newdoc.save()
            # NOTE(review): this is the point where the file is stored, so a
            # PDF-scraping step would be called here -- confirm the design.
            # Redirect to the document list after POST
            return redirect('my-view')
        else:
            message = 'The form is not valid. Fix the following error:'
    else:
        form = DocumentForm()  # An empty, unbound form
    # Load documents for the list page
    documents = Document.objects.all()
    # Render list page with the documents and the form
    context = {'documents': documents, 'form': form, 'message': message}
    return render(request, 'list.html', context)
forms.py
from django import forms
class DocumentForm(forms.Form):
    """Upload form exposing a single file field named ``docfile``."""

    # The label is the text shown next to the file chooser in the template.
    docfile = forms.FileField(label='Select a file')
models.py
from django.db import models
class Document(models.Model):
    """Database record for one uploaded file."""

    # upload_to contains strftime placeholders, so files are grouped into
    # documents/<year>/<month>/<day> sub-directories.
    docfile = models.FileField(upload_to='documents/%Y/%m/%d')
list.html
<!DOCTYPE html>
<html>
    <head>
        <meta charset="utf-8">
        <title>webpage</title>
    </head>
    <body>
        <!-- Upload form. enctype="multipart/form-data" is required for file uploads. -->
        <form action="{% url 'my-view' %}" method="post" enctype="multipart/form-data">
            {% csrf_token %}
            {{ message }}
            <p>{{ form.non_field_errors }}</p>
            <!-- Select a file: text -->
            <p>{{ form.docfile.label_tag }} {{ form.docfile.help_text }}</p>
            <!-- choose file button -->
            <p>
                {{ form.docfile.errors }}
                {{ form.docfile }}
            </p>
            <!-- Upload button -->
            <p><input type="submit" value="Upload"/></p>
        </form>
    </body>
</html>
Edit: added urls.py
urls.py
from django.urls import path
from .views import my_view
# Serve the upload view at the app root; the name 'my-view' is what
# redirect() and the template's {% url %} tag look up.
urlpatterns = [
    path('', my_view, name='my-view'),
]
Scrape.py
Want to output and save Plan_Name.
import os
import pdfplumber
import re
# Scan a directory for PDFs and pull the plan title out of each one.
#
# NOTE(review): 'C:User/...' has no separator after the drive letter, so on
# Windows it is resolved relative to the current directory of drive C:.
# Confirm the intended absolute path.
directory = r'C:User/Ant_Esc/Desktop'

# The title is the text between these two labels; the match is non-greedy so
# it stops at the first 'Program/Discipline' that follows.
pattern = 'Plan Title/Name .*? Program/Discipline'

# Cleaned plan names for every processed PDF, keyed by file name.
plan_names_by_file = {}

for filename in os.listdir(directory):
    if not filename.endswith('.pdf'):
        continue
    fullpath = os.path.join(directory, filename)

    with pdfplumber.open(fullpath) as pdf:
        # extract_text() may return None for a page without text (depends on
        # the pdfplumber version); fall back to '' so concatenation is safe.
        all_text = ''.join(
            ' ' + (page.extract_text() or '') for page in pdf.pages
        )
    all_text = all_text.replace('\n', '')

    # Keep Plan_Name a list for the whole iteration instead of rebinding it
    # to a string while looping over it, which kept only the last match.
    Plan_Name = [
        match.removesuffix('Program/Discipline')
        .removeprefix('Plan Title/Name ')
        .strip()  # drop the space left in front of the removed suffix
        for match in re.findall(pattern, all_text, re.DOTALL)
    ]
    plan_names_by_file[filename] = Plan_Name
    print(f'{filename}: {Plan_Name}')
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
我已经浏览了您的代码,您可以在下面的两个查询上确认吗?我觉得这些缺失。
我的建议是:可以在 views.py 中 newdoc.save() 成功保存文件之后调用 scrape.py,也可以在模型中重写 save() 方法,在调用 super() 之后运行 scrape.py。
让我知道您是否需要更多帮助。
I've gone through your code. Can you confirm the two queries below? I feel these are missing.
My suggestion: you can call scrape.py right after the file is saved successfully by newdoc.save() in views.py, or you can call scrape.py from the model by overriding save() and calling super().
Let me know if you need more help on this.