使用python将数据写入excel时索引超出范围异常

发布于 2025-01-10 00:28:00 字数 2632 浏览 0 评论 0原文

从 URL 获取数据时出现错误，数据 [][] 中索引超出范围异常。

import requests
from bs4 import BeautifulSoup
import openpyxl
import uuid

row_number=2

for i in range(1,3):
  website_url = "https://www.example.com/job_search?page=1&txtKeyword=IT&keyword=&txtLocation=Thiruvananthapuram%2C&page="+str(i)
  res = requests.get(website_url, verify=False)
  soup = BeautifulSoup(res.text, 'lxml')
  Links = soup.find_all('div', {'class': 'col-md-6 col-sm-12'},)

  data1=[]
  data2=[]
  for div_block in soup.find_all('dl', class_=['description-list'],style=None):
        data1.append([line.strip() for line in div_block.stripped_strings])
  for div_block in soup.find_all('div', class_=['details full-width'],style=None):
        data2.append([line.strip() for line in div_block.stripped_strings])
  url = [tag.find('a')['href'] for tag in Links]
  wb = openpyxl.Workbook()


# Write a header row
  columns = [
      ("job_listing_id"),#1
      ("unique_hash"),#2`enter code here`
      ("status"),#3
      ("primary_skills"),#4
      ("secondary_skills"),#5
      ("title"),#6
      ("description"),#7
      ("job_type"),#8
      ("source"),#9
      ("experience"),#10
      ("location"),#11
      ("company"),#12
      ("posted_date"),#13
      ("expiryDate"),#14
      ("vacancies"),#15
      ("company_website"),#16
      ("posted_by")#17
       ]
  ws = wb.active

  for col_number, value, in enumerate(columns, start=1)   :
      ws.cell(column=col_number, row=1, value=value)

  row_number=row_number

  y=0
  id=uuid.uuid4

  for x in url:
      res = requests.get(f'{x}', verify=False)
      soup = BeautifulSoup(res.text, 'lxml')
      data = []

      for div_block in soup.find_all('div', class_=['content-block-normal','details full-width'], style=None) :
          data.append([line.strip() for line in div_block.stripped_strings])
    
    # Write a data row
      row = [
        
          row_number-1,  # job_listing_id
          str(uuid.uuid4()), # Unique Hash
          'NEW',#status
          data1[y][5],#primary_skills
          '',#secondary_skills
          data2[y][0],  # Job title
          '\n'.join(data[2][1:]) ,#description`
          'NA',#job_type
          'NA', #source
          data1[y][1],#EXp
          data[1][4],#Location
          data2[y][1],#company
          data[0][3],  # posted on
          '', #expiryDate
          '',#vacancies
          '',#company_website
          'example'#posted_by
      ]
      y+=1

      for col_number, value in enumerate(row, start=1):
          cell = ws.cell(column=col_number, row=row_number, value=value)
      row_number += 1

原文

While taking data from the URL I got an error, index out of range exception in the data[][].

import requests
from bs4 import BeautifulSoup
import openpyxl
import uuid

row_number=2

for i in range(1,3):
  website_url = "https://www.example.com/job_search?page=1&txtKeyword=IT&keyword=&txtLocation=Thiruvananthapuram%2C&page="+str(i)
  res = requests.get(website_url, verify=False)
  soup = BeautifulSoup(res.text, 'lxml')
  Links = soup.find_all('div', {'class': 'col-md-6 col-sm-12'},)

  data1=[]
  data2=[]
  for div_block in soup.find_all('dl', class_=['description-list'],style=None):
        data1.append([line.strip() for line in div_block.stripped_strings])
  for div_block in soup.find_all('div', class_=['details full-width'],style=None):
        data2.append([line.strip() for line in div_block.stripped_strings])
  url = [tag.find('a')['href'] for tag in Links]
  wb = openpyxl.Workbook()


# Write a header row
  columns = [
      ("job_listing_id"),#1
      ("unique_hash"),#2`enter code here`
      ("status"),#3
      ("primary_skills"),#4
      ("secondary_skills"),#5
      ("title"),#6
      ("description"),#7
      ("job_type"),#8
      ("source"),#9
      ("experience"),#10
      ("location"),#11
      ("company"),#12
      ("posted_date"),#13
      ("expiryDate"),#14
      ("vacancies"),#15
      ("company_website"),#16
      ("posted_by")#17
       ]
  ws = wb.active

  for col_number, value, in enumerate(columns, start=1)   :
      ws.cell(column=col_number, row=1, value=value)

  row_number=row_number

  y=0
  id=uuid.uuid4

  for x in url:
      res = requests.get(f'{x}', verify=False)
      soup = BeautifulSoup(res.text, 'lxml')
      data = []

      for div_block in soup.find_all('div', class_=['content-block-normal','details full-width'], style=None) :
          data.append([line.strip() for line in div_block.stripped_strings])
    
    # Write a data row
      row = [
        
          row_number-1,  # job_listing_id
          str(uuid.uuid4()), # Unique Hash
          'NEW',#status
          data1[y][5],#primary_skills
          '',#secondary_skills
          data2[y][0],  # Job title
          '\n'.join(data[2][1:]) ,#description`
          'NA',#job_type
          'NA', #source
          data1[y][1],#EXp
          data[1][4],#Location
          data2[y][1],#company
          data[0][3],  # posted on
          '', #expiryDate
          '',#vacancies
          '',#company_website
          'example'#posted_by
      ]
      y+=1

      for col_number, value in enumerate(row, start=1):
          cell = ws.cell(column=col_number, row=row_number, value=value)
      row_number += 1

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

盗琴音 2025-01-17 00:28:00

对于您在评论中给出的 URL，数据以 JSON 形式返回，可以直接访问，如下所示：

import requests

url = 'https://booking.snav.it/api/v1/rates/1040/2019-02-25/1042/2019-02-25?lang=1'
req = requests.get(url, verify=False)
data = req.json()  

print(data['data']['itineraryOutward']['description'])

这会将 description 显示为：

NAPOLI BEVERELLO - ISCHIA CASAMICCIOLA

对于您的其他站点，您将需要调查尝试使用他们的API。以下内容应该可以帮助您开始：

import requests
from bs4 import BeautifulSoup
import openpyxl


headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
    'Host' : 'jobseeker-api.hirist.com',
    'Referer' : 'https://www.hirist.com/',
    'Authorization' : 'Bearer undefined',
    'Origin' : 'https://www.hirist.com',
}

params = {
    'pageNo' : '0',
    'query' : 'software Engineer',
    'loc' : '17',
    'minexp' : '0',
    'maxexp' : '0',
    'range' : '0',
    'boost' : '0',
    'searchRange' : '4',
    'searchOp' : 'AND',
    'jobType' : '1',
}

with requests.Session() as sess:
    url = "https://jobseeker-api.hirist.com/jobfeed/-1/search"
    req_search = sess.get(url, params=params, headers=headers)

data = req_search.json()

wb = openpyxl.Workbook()
ws = wb.active
ws.append(['title', 'min_years', 'max_years'])

for job in data['jobs']:
    row = [
        job['title'],
        job['min'],
        job['max'],
    ]
    
    ws.append(row)

wb.save('output.xlsx')

为您提供开始数据：

希望这有帮助

For the URL you have given in the comment, the data is returned as JSON which can be accessed directly as follows:

import requests

url = 'https://booking.snav.it/api/v1/rates/1040/2019-02-25/1042/2019-02-25?lang=1'
req = requests.get(url, verify=False)
data = req.json()  

print(data['data']['itineraryOutward']['description'])

This would display description as:

NAPOLI BEVERELLO - ISCHIA CASAMICCIOLA

For your other site, you will need to investigate trying to use their API. The following should get you started:

import requests
from bs4 import BeautifulSoup
import openpyxl


headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36',
    'Host' : 'jobseeker-api.hirist.com',
    'Referer' : 'https://www.hirist.com/',
    'Authorization' : 'Bearer undefined',
    'Origin' : 'https://www.hirist.com',
}

params = {
    'pageNo' : '0',
    'query' : 'software Engineer',
    'loc' : '17',
    'minexp' : '0',
    'maxexp' : '0',
    'range' : '0',
    'boost' : '0',
    'searchRange' : '4',
    'searchOp' : 'AND',
    'jobType' : '1',
}

with requests.Session() as sess:
    url = "https://jobseeker-api.hirist.com/jobfeed/-1/search"
    req_search = sess.get(url, params=params, headers=headers)

data = req_search.json()

wb = openpyxl.Workbook()
ws = wb.active
ws.append(['title', 'min_years', 'max_years'])

for job in data['jobs']:
    row = [
        job['title'],
        job['min'],
        job['max'],
    ]
    
    ws.append(row)

wb.save('output.xlsx')

Giving you data starting: