Passing a file from Python into a Bash script's heredoc for download

Posted on 2025-01-22 16:38:41


I have a shell script downloaded from EarthData Search as follows:

download.sh

#!/bin/bash

GREP_OPTIONS=''

cookiejar=$(mktemp cookies.XXXXXXXXXX)
netrc=$(mktemp netrc.XXXXXXXXXX)
chmod 0600 "$cookiejar" "$netrc"
function finish {
  rm -rf "$cookiejar" "$netrc"
}

trap finish EXIT
WGETRC="$wgetrc"

prompt_credentials() {
    echo "Enter your Earthdata Login or other provider supplied credentials"
    read -p "Username (tylersingleton): " username
    username=${username:-tylersingleton}
    read -s -p "Password: " password
    echo "machine urs.earthdata.nasa.gov login $username password $password" >> $netrc
    echo
}

exit_with_error() {
    echo
    echo "Unable to Retrieve Data"
    echo
    echo $1
    echo
    echo "https://n5eil01u.ecs.nsidc.org/DP4/SMAP/SPL3SMP.008/2019.03.09/SMAP_L3_SM_P_20190309_R18290_001.h5"
    echo
    exit 1
}

prompt_credentials
  detect_app_approval() {
    approved=`curl -s -b "$cookiejar" -c "$cookiejar" -L --max-redirs 5 --netrc-file "$netrc" https://n5eil01u.ecs.nsidc.org/DP4/SMAP/SPL3SMP.008/2019.03.09/SMAP_L3_SM_P_20190309_R18290_001.h5 -w %{http_code} | tail  -1`
    if [ "$approved" -ne "302" ]; then
        # User didn't approve the app. Direct users to approve the app in URS
        exit_with_error "Please ensure that you have authorized the remote application by visiting the link below "
    fi
}

setup_auth_curl() {
    # First, check whether the resource requires URS authentication
    status=$(curl -s -z "$(date)" -w %{http_code} https://n5eil01u.ecs.nsidc.org/DP4/SMAP/SPL3SMP.008/2019.03.09/SMAP_L3_SM_P_20190309_R18290_001.h5 | tail -1)
    if [[ "$status" -ne "200" && "$status" -ne "304" ]]; then
        # URS authentication is required. Now further check if the application/remote service is approved.
        detect_app_approval
    fi
}

setup_auth_wget() {
    # The safest way to auth via wget is netrc. Note: there's no checking or feedback
    # if login is unsuccessful
    touch ~/.netrc
    chmod 0600 ~/.netrc
    credentials=$(grep 'machine urs.earthdata.nasa.gov' ~/.netrc)
    if [ -z "$credentials" ]; then
        cat "$netrc" >> ~/.netrc
    fi
}

fetch_urls() {
  if command -v curl >/dev/null 2>&1; then
      setup_auth_curl
      while read -r line; do
        # Get everything after the last '/'
        filename="${line##*/}"

        # Strip everything after '?'
        stripped_query_params="${filename%%\?*}"

        curl -f -b "$cookiejar" -c "$cookiejar" -L --netrc-file "$netrc" -g -o $stripped_query_params -- $line && echo || exit_with_error "Command failed with error. Please retrieve the data manually."
      done;
  elif command -v wget >/dev/null 2>&1; then
      # We can't use wget to probe the provider server for whether URS is integrated without downloading at least one of the files.
      echo
      echo "WARNING: Can't find curl, use wget instead."
      echo "WARNING: Script may not correctly identify Earthdata Login integrations."
      echo
      setup_auth_wget
      while read -r line; do
        # Get everything after the last '/'
        filename="${line##*/}"

        # Strip everything after '?'
        stripped_query_params="${filename%%\?*}"

        wget --load-cookies "$cookiejar" --save-cookies "$cookiejar" --output-document $stripped_query_params --keep-session-cookies -- $line && echo || exit_with_error "Command failed with error. Please retrieve the data manually."
      done;
  else
      exit_with_error "Error: Could not find a command-line downloader.  Please install curl or wget"
  fi
}

fetch_urls <<'EDSCEOF'
https://n5eil01u.ecs.nsidc.org/DP4/SMAP/SPL3SMP.008/2019.03.09/SMAP_L3_SM_P_20190309_R18290_001.h5
...
https://n5eil01u.ecs.nsidc.org/DP4/SMAP/SPL3SMP.008/2019.03.08/SMAP_L3_SM_P_20190308_R18290_001.h5
EDSCEOF

At the bottom, a list of URLs is fed via a heredoc into the fetch_urls function. I have been attempting to remove this portion and house the URLs in a text file that I can pass as an argument to download.sh from Python, as in:

import subprocess

subprocess.run(['bash', 'download.sh', 'URLs.txt'])

I have tried editing my bash script to have fetch_urls accept a variable as an input.

fetch_urls $1

and 

URLs=$(cat URLs.txt)
fetch_urls <<'EDSCEOF'
$URLs
EDSCEOF

and 

while read -r url; do fetch_urls <<< echo "$url"; done < URLs.txt

and 

fetch_urls <<'EDSCEOF'
while read -r url; do echo "$url"; done < URLs.txt
EDSCEOF
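
For reference, these attempts fail for two reasons: fetch_urls reads its URLs from standard input rather than from its arguments, and quoting the heredoc delimiter (<<'EDSCEOF') disables variable expansion, so $URLs is passed through as a literal string. A minimal sketch of the working forms, assuming URLs.txt holds one URL per line:

# Feed the URL file straight into fetch_urls on stdin
fetch_urls < URLs.txt

# Or keep the heredoc but leave the delimiter unquoted so $URLs expands
URLs=$(cat URLs.txt)
fetch_urls <<EDSCEOF
$URLs
EDSCEOF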

But I know nothing about bash, and cannot figure out how this should be done. Additionally, I would like the files downloaded from each URL list to be saved into their own folder, i.e. I am attempting to have a file structure like this:

.
|--- main.py
|--- data_folder
    |--- download.sh
    |--- URLs_1.txt
    |--- URLs_2.txt
    |--- folder_1
        |--- URLs_1_Data
    |--- folder_2
        |--- URLs_2_Data

So any pointers to the relevant documentation would be helpful. In Python's subprocess I can change the CWD, but that would cause my data to be downloaded into the same folder as the bash script. I would rather avoid this and simply pass two variables to the bash script: 1) the location of the URL text file to use; 2) where to save the downloaded data.
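
For what it's worth, a minimal sketch of that two-argument interface (untested; the names url_file and out_dir are illustrative, and readlink -f assumes GNU coreutils): the argument handling goes at the top of download.sh, and the trailing heredoc is replaced with a redirection.

# At the very top of download.sh, right after the shebang:
url_file=$(readlink -f "$1")   # absolute path to the URL list, so it survives the cd
out_dir="${2:-.}"              # optional second argument: where to save the data
mkdir -p "$out_dir" && cd "$out_dir" || { echo "Cannot enter $out_dir" >&2; exit 1; }

# ...the rest of download.sh stays unchanged, except that the final
# fetch_urls <<'EDSCEOF' ... EDSCEOF heredoc becomes:
fetch_urls < "$url_file"

Because the script cds before creating its temp files, the cookie jar and the downloads all land in $out_dir. From Python the call could then look like subprocess.run(['bash', 'download.sh', 'URLs_1.txt', 'folder_1'], cwd='data_folder'), with the paths matching the directory tree above (again, illustrative).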


Comments (1)

故事与诗 2025-01-29 16:38:41


I could not figure out how to accomplish this with bash. The answers provided only partly worked for my use case: if the file contained only one URL it would work, but any more would freeze the program. I have found a solution using Python, in which my project is based.

import requests  # get the requests library from https://github.com/requests/requests


# overriding requests.Session.rebuild_auth to maintain headers when redirected
class SessionWithHeaderRedirection(requests.Session):
    AUTH_HOST = 'urs.earthdata.nasa.gov'

    def __init__(self, username, password):
        super().__init__()
        self.auth = (username, password)

    # Overrides from the library to keep headers when redirected to or from
    # the NASA auth host.
    def rebuild_auth(self, prepared_request, response):
        headers = prepared_request.headers
        url = prepared_request.url
        if 'Authorization' in headers:
            original_parsed = requests.utils.urlparse(response.request.url)
            redirect_parsed = requests.utils.urlparse(url)
            if (original_parsed.hostname != redirect_parsed.hostname) and \
                    redirect_parsed.hostname != self.AUTH_HOST and \
                    original_parsed.hostname != self.AUTH_HOST:
                del headers['Authorization']
        return


# create session with the user credentials that will be used to authenticate access to the data
username = "**********"
password = "**********"
session = SessionWithHeaderRedirection(username, password)

# read the URLs of the files we wish to retrieve,
# stripping the trailing newline from each line
with open('test.txt') as url_file:
    urls = [url.strip('\n') for url in list(url_file)]

for url in urls:
    # extract the filename from the url to be used when saving the file
    filename = url[url.rfind('/') + 1:]

    try:
        # submit the request using the session
        response = session.get(url, stream=True)
        print(response.status_code)
        # raise an exception in case of http errors
        response.raise_for_status()
        # save the file

        with open(filename, 'wb') as fd:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                fd.write(chunk)

    except requests.exceptions.HTTPError as e:
        # handle any errors here
        print(e)

This code was adapted from https://urs.earthdata.nasa.gov/documentation/for_users/data_access/python

I have only slightly modified it to work with my project.
