Django上传PDF，然后运行脚本抓取PDF内容并输出结果

发布于 2025-01-30 07:21:07 字数 3456 浏览 2 评论 0原文

我正在尝试创建一个Django Web应用程序，该应用程序允许用户上传PDF，然后由脚本对其进行抓取，并输出和保存脚本抓取到的某些文本。

我找到了一些代码来实现文件上传部分，也已经有了用于抓取PDF的脚本，但不确定如何把它们结合起来完成这项任务。

views.py

from django.shortcuts import redirect, render
from .models import Document
from .forms import DocumentForm

def my_view(request):
    """Handle the upload form and render the document list page.

    POST with a valid form: save the uploaded file as a new ``Document`` and
    redirect back to this view. POST with an invalid form: re-render the page
    with the bound form so its errors are shown. Any other method: render the
    page with an empty form.
    """
    print(f"Great! You're using Python 3.6+. If you fail here, use the right version.")
    message = 'Upload PDF'
    # Handle file upload
    if request.method == 'POST':
        form = DocumentForm(request.POST, request.FILES)
        if form.is_valid():
            newdoc = Document(docfile=request.FILES['docfile'])
            newdoc.save()

            # Redirect to the document list after POST
            return redirect('my-view')
        else:
            message = 'The form is not valid. Fix the following error:'
    else:
        form = DocumentForm()  # An empty, unbound form

    # Load documents for the list page
    documents = Document.objects.all()

    # Render list page with the documents and the form
    context = {'documents': documents, 'form': form, 'message': message}
    return render(request, 'list.html', context)

forms.py

from django import forms

class DocumentForm(forms.Form):
    """Upload form exposing a single file field named ``docfile``."""
    docfile = forms.FileField(label='Select a file')

models.py

from django.db import models

class Document(models.Model):
    """Model holding one uploaded file."""
    # upload_to uses date placeholders, so files land in a per-day
    # documents/<year>/<month>/<day> subdirectory of the storage location.
    docfile = models.FileField(upload_to='documents/%Y/%m/%d')

list.html

<!DOCTYPE html>
<html>
    <head>
        <meta charset="utf-8">
        <title>webpage</title>
    </head>
    <body>
        <!-- Upload form. Note enctype attribute! -->
        <form action="{% url 'my-view' %}" method="post" enctype="multipart/form-data">
            {% csrf_token %}
            {{ message }}
            <p>{{ form.non_field_errors }}</p>

            <!-- Select a file: text -->
            <p>{{ form.docfile.label_tag }} {{ form.docfile.help_text }}</p>

            <!-- choose file button -->
            <p>
                {{ form.docfile.errors }}
                {{ form.docfile }}
            </p>

            <!-- Upload button -->
            <p><input type="submit" value="Upload"/></p>
        </form>
    </body>
</html>

编辑添加了urls.py
urls.py

from django.urls import path
from .views import my_view

# Route the empty path to my_view. The route name 'my-view' is the one the
# view passes to redirect() and the template passes to {% url %}.
urlpatterns = [
    path('', my_view, name='my-view')
]

scrape.py
想要输出并保存Plan_Name。

import os
import pdfplumber
import re
# NOTE(review): this path looks malformed (no separator after the drive
# letter, "User" rather than "Users") -- confirm the intended folder.
directory = r'C:User/Ant_Esc/Desktop'

# The plan name is the text between these two labels. The single space before
# "Program/Discipline" is part of the pattern, so the captured group carries
# no trailing space.
PLAN_NAME_PATTERN = re.compile(
    r'Plan Title/Name  (.*?) Program/Discipline', re.DOTALL
)


def extract_pdf_text(fullpath):
    """Return the text of all pages of the PDF at *fullpath* as one string.

    Each page's text is prefixed with a space and all newlines are removed.
    """
    with pdfplumber.open(fullpath) as pdf:
        # Guard against extract_text() returning None for a page without
        # text (presumably possible on some pdfplumber versions -- verify).
        page_texts = [page.extract_text() or '' for page in pdf.pages]
    return ''.join(' ' + text for text in page_texts).replace('\n', '')


def extract_plan_names(all_text):
    """Return every plan name found in *all_text*, in order of appearance.

    Returns an empty list when the labels are not present.
    """
    return PLAN_NAME_PATTERN.findall(all_text)


def scrape_directory(folder):
    """Map each ``.pdf`` filename in *folder* to its list of plan names."""
    results = {}
    for filename in os.listdir(folder):
        if filename.endswith('.pdf'):
            fullpath = os.path.join(folder, filename)
            results[filename] = extract_plan_names(extract_pdf_text(fullpath))
    return results


if __name__ == '__main__':
    # Run the scrape only when executed as a script, so the helpers above can
    # be imported (e.g. from a Django view) without touching the filesystem.
    for pdf_name, plan_names in scrape_directory(directory).items():
        print(pdf_name, plan_names)

原文

I am attempting to create a Django web app that allows the user to upload a pdf then have a script scrape it and output and save certain text that the script scraped.

I was able to find some code to do the file upload part.
I have the script to scrape the pdf.
Not sure how to tie them together to accomplish this task.

views.py

from django.shortcuts import redirect, render
from .models import Document
from .forms import DocumentForm

def my_view(request):
    """Handle the upload form and render the document list page.

    POST with a valid form: save the uploaded file as a new ``Document`` and
    redirect back to this view. POST with an invalid form: re-render the page
    with the bound form so its errors are shown. Any other method: render the
    page with an empty form.
    """
    print(f"Great! You're using Python 3.6+. If you fail here, use the right version.")
    message = 'Upload PDF'
    # Handle file upload
    if request.method == 'POST':
        form = DocumentForm(request.POST, request.FILES)
        if form.is_valid():
            newdoc = Document(docfile=request.FILES['docfile'])
            newdoc.save()

            # Redirect to the document list after POST
            return redirect('my-view')
        else:
            message = 'The form is not valid. Fix the following error:'
    else:
        form = DocumentForm()  # An empty, unbound form

    # Load documents for the list page
    documents = Document.objects.all()

    # Render list page with the documents and the form
    context = {'documents': documents, 'form': form, 'message': message}
    return render(request, 'list.html', context)

forms.py

from django import forms

class DocumentForm(forms.Form):
    """Upload form exposing a single file field named ``docfile``."""
    docfile = forms.FileField(label='Select a file')

models.py

from django.db import models

class Document(models.Model):
    """Model holding one uploaded file."""
    # upload_to uses date placeholders, so files land in a per-day
    # documents/<year>/<month>/<day> subdirectory of the storage location.
    docfile = models.FileField(upload_to='documents/%Y/%m/%d')

list.html

<!DOCTYPE html>
<html>
    <head>
        <meta charset="utf-8">
        <title>webpage</title>
    </head>
    <body>
        <!-- Upload form. Note enctype attribute! -->
        <form action="{% url 'my-view' %}" method="post" enctype="multipart/form-data">
            {% csrf_token %}
            {{ message }}
            <p>{{ form.non_field_errors }}</p>

            <!-- Select a file: text -->
            <p>{{ form.docfile.label_tag }} {{ form.docfile.help_text }}</p>

            <!-- choose file button -->
            <p>
                {{ form.docfile.errors }}
                {{ form.docfile }}
            </p>

            <!-- Upload button -->
            <p><input type="submit" value="Upload"/></p>
        </form>
    </body>
</html>

edit added urls.py
urls.py

from django.urls import path
from .views import my_view

# Route the empty path to my_view. The route name 'my-view' is the one the
# view passes to redirect() and the template passes to {% url %}.
urlpatterns = [
    path('', my_view, name='my-view')
]

Scrape.py
Want to output and save Plan_Name.

import os
import pdfplumber
import re
# NOTE(review): this path looks malformed (no separator after the drive
# letter, "User" rather than "Users") -- confirm the intended folder.
directory = r'C:User/Ant_Esc/Desktop'

# The plan name is the text between these two labels. The single space before
# "Program/Discipline" is part of the pattern, so the captured group carries
# no trailing space.
PLAN_NAME_PATTERN = re.compile(
    r'Plan Title/Name  (.*?) Program/Discipline', re.DOTALL
)


def extract_pdf_text(fullpath):
    """Return the text of all pages of the PDF at *fullpath* as one string.

    Each page's text is prefixed with a space and all newlines are removed.
    """
    with pdfplumber.open(fullpath) as pdf:
        # Guard against extract_text() returning None for a page without
        # text (presumably possible on some pdfplumber versions -- verify).
        page_texts = [page.extract_text() or '' for page in pdf.pages]
    return ''.join(' ' + text for text in page_texts).replace('\n', '')


def extract_plan_names(all_text):
    """Return every plan name found in *all_text*, in order of appearance.

    Returns an empty list when the labels are not present.
    """
    return PLAN_NAME_PATTERN.findall(all_text)


def scrape_directory(folder):
    """Map each ``.pdf`` filename in *folder* to its list of plan names."""
    results = {}
    for filename in os.listdir(folder):
        if filename.endswith('.pdf'):
            fullpath = os.path.join(folder, filename)
            results[filename] = extract_plan_names(extract_pdf_text(fullpath))
    return results


if __name__ == '__main__':
    # Run the scrape only when executed as a script, so the helpers above can
    # be imported (e.g. from a Django view) without touching the filesystem.
    for pdf_name, plan_names in scrape_directory(directory).items():
        print(pdf_name, plan_names)

分享到QQ

分享到微博