可以在URL中具有动态值的Scrape API

发布于 2025-02-08 07:22:14 字数 1024 浏览 2 评论 0原文

我尝试从URL中具有动态值的INSEE数据库中刮擦警笛信息。 状态代码必须为200或299。我的结果是无,无

import pandas as pd

import requests

def extract_siren_code(siren):

    siren_recup, features = None, None

    base_url = "https://api.insee.fr/entreprises/sirene/V3/siren/"

    endpoint = f"{base_url}{siren}"

    headers = {"Authorization": "Bearer <my bearer token>", "Accept": "application/json"}

    response = requests.get(endpoint, headers=headers)

    if response.status_code not in range(200, 299):
        return None, None
    try:
        '''
        This try block incase any of our inputs are invalid. This is done instead
        of actually writing out handlers for all kinds of responses.
        '''
        results = response.json()['uniteLegale'][0]
        print(results)
        siren_recup = results['siren']
        features = ['uniteLegale']

    except:
        pass
    return siren_recup, features
siren_recup, features = extract_siren_code('824239214')

print(siren_recup, features)

I try to scrape siren information from the Insee database with a dynamic value in the URL.
The status-code have to be 200 or 299. The result that I have, is None, None.

import pandas as pd

import requests

def extract_siren_code(siren):

    siren_recup, features = None, None

    base_url = "https://api.insee.fr/entreprises/sirene/V3/siren/"

    endpoint = f"{base_url}{siren}"

    headers = {"Authorization": "Bearer <my bearer token>", "Accept": "application/json"}

    response = requests.get(endpoint, headers=headers)

    if response.status_code not in range(200, 299):
        return None, None
    try:
        '''
        This try block incase any of our inputs are invalid. This is done instead
        of actually writing out handlers for all kinds of responses.
        '''
        results = response.json()['uniteLegale'][0]
        print(results)
        siren_recup = results['siren']
        features = ['uniteLegale']

    except:
        pass
    return siren_recup, features
siren_recup, features = extract_siren_code('824239214')

print(siren_recup, features)

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(2

怕倦 2025-02-15 07:22:14

实际上,有些公司只有一个unitelegale,而API由dict而不是一个dict的列表回复,因此您需要为这种情况添加条件:

import pandas as pd

import requests

def extract_siren_code(siren):

    siren_recup, features = None, None

    base_url = "https://api.insee.fr/entreprises/sirene/V3/siren/"

    endpoint = f"{base_url}{siren}"

    headers = {"Authorization": "Bearer <my bearer token>", "Accept": "application/json"}

    response = requests.get(endpoint, headers=headers)

    if response.status_code not in range(200, 299):
        return None, None
    try:
        '''
        This try block incase any of our inputs are invalid. This is done instead
        of actually writing out handlers for all kinds of responses.
        '''
        unite_legale = response.json()['uniteLegale']
        results = unite_legale[0] if isinstance(unite_legale, list) else unite_legale
        siren_recup = results['siren']
        features = ['uniteLegale']

    except:
        pass
    return siren_recup, features

actually some companies have only one uniteLegale and the API reply by a dict instead of a list of one dict, so you need to add a condition for this case:

import pandas as pd

import requests

def extract_siren_code(siren):

    siren_recup, features = None, None

    base_url = "https://api.insee.fr/entreprises/sirene/V3/siren/"

    endpoint = f"{base_url}{siren}"

    headers = {"Authorization": "Bearer <my bearer token>", "Accept": "application/json"}

    response = requests.get(endpoint, headers=headers)

    if response.status_code not in range(200, 299):
        return None, None
    try:
        '''
        This try block incase any of our inputs are invalid. This is done instead
        of actually writing out handlers for all kinds of responses.
        '''
        unite_legale = response.json()['uniteLegale']
        results = unite_legale[0] if isinstance(unite_legale, list) else unite_legale
        siren_recup = results['siren']
        features = ['uniteLegale']

    except:
        pass
    return siren_recup, features
妄想挽回 2025-02-15 07:22:14
# -*- coding: utf-8 -*-
import pandas as pd
import requests


GOOGLE_API_KEY = 'AIzaSyBsMyay1fCz3gHysNXec8TbClnwYaprwVI' 

def extract_lat_long_via_address(address_or_zipcode):
    formatted_address, lat, lng = None, None, None
    api_key = GOOGLE_API_KEY
    base_url = "https://maps.googleapis.com/maps/api/geocode/json"
    
    endpoint = f"{base_url}?address={address_or_zipcode}&key={api_key}"
    # see how our endpoint includes our API key? Yes this is yet another reason to restrict the key
    r = requests.get(endpoint)
#    print(r.text)
    if r.status_code not in range(200, 299):
        return None, None, None
    try:
        '''
        This try block incase any of our inputs are invalid. This is done instead
        of actually writing out handlers for all kinds of responses.
        '''
        results = r.json()['results'][0]
        formatted_address = results['formatted_address']
        lat = results['geometry']['location']['lat']
        lng = results['geometry']['location']['lng']
    except:
        pass
    return formatted_address, lat, lng

address_list= ds1['Adresse_N°_Niv_Typ_Voie'].tolist()

#address_list = ['2 rue de la garbotiere', '137 Rue de la chocolaterie', '94 Allée de Ailes', '12 Rue de la Federation', 'Industries traat 38 2500 LIER', '262 Avenue Albert Calmette']

Adresse2 = pd.DataFrame()

for adress in address_list:
    new_address_data = pd.DataFrame([extract_lat_long_via_address(adress)], columns=['formatted_address','latitude', 'longitude'])
    Adresse2 = pd.concat([Adresse2, new_address_data], axis=0, ignore_index=True)
# -*- coding: utf-8 -*-
import pandas as pd
import requests


GOOGLE_API_KEY = 'AIzaSyBsMyay1fCz3gHysNXec8TbClnwYaprwVI' 

def extract_lat_long_via_address(address_or_zipcode):
    formatted_address, lat, lng = None, None, None
    api_key = GOOGLE_API_KEY
    base_url = "https://maps.googleapis.com/maps/api/geocode/json"
    
    endpoint = f"{base_url}?address={address_or_zipcode}&key={api_key}"
    # see how our endpoint includes our API key? Yes this is yet another reason to restrict the key
    r = requests.get(endpoint)
#    print(r.text)
    if r.status_code not in range(200, 299):
        return None, None, None
    try:
        '''
        This try block incase any of our inputs are invalid. This is done instead
        of actually writing out handlers for all kinds of responses.
        '''
        results = r.json()['results'][0]
        formatted_address = results['formatted_address']
        lat = results['geometry']['location']['lat']
        lng = results['geometry']['location']['lng']
    except:
        pass
    return formatted_address, lat, lng

address_list= ds1['Adresse_N°_Niv_Typ_Voie'].tolist()

#address_list = ['2 rue de la garbotiere', '137 Rue de la chocolaterie', '94 Allée de Ailes', '12 Rue de la Federation', 'Industries traat 38 2500 LIER', '262 Avenue Albert Calmette']

Adresse2 = pd.DataFrame()

for adress in address_list:
    new_address_data = pd.DataFrame([extract_lat_long_via_address(adress)], columns=['formatted_address','latitude', 'longitude'])
    Adresse2 = pd.concat([Adresse2, new_address_data], axis=0, ignore_index=True)
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文