获得最大的图像

发布于 2024-10-31 18:51:37 字数 18332 浏览 0 评论 0原文

我正在做一个图像搜索项目，我想获得一页最大的图像。我添加了一些代码来修复图像的真实地址，删除可能是广告的图像。比较宽度*高度，得出最大的一个。但我的代码有一些问题。这是我的整个代码。谁能帮助我修复错误的地方以及如何优化代码，我觉得这个过程很痛苦缓慢。感谢大家。

<?php
require_once 'simple_html_dom.php';
require 'url_to_absolute.php'; //get image absolute url
$v = 'http://www.yomiuri.co.jp/stream/';
$html = file_get_html($v);
$maxsize = -1; 
$the_biggest_image = false;
$arr = array('ad', 'ads','gif');// add ads possible words as a arry which is check in the image url
foreach($html->find('img') as $element) {
    preg_match_all('#https?://(.*?)($|/)#m', urldecode(stripcslashes($v)), $r); //get site base url
    $pic = $element->src;
    $comm = url_to_absolute( $r[0][0], $pic);//get image absolute url
    $check_flag = true;
    foreach($arr as $item) {
        if (substr_count(strtolower($comm),$item) > 0) $check_flag = false;
    }// remove ads images
    if ($check_flag) $arr = @getimagesize($comm);// get the rest images width and height
    reset($comm);
        if (($arr[0] * $arr[1]) > $maxsize) {   
            $maxsize = $arr[0] * $arr[1];  //compare images' sise
            $the_biggest_image = $comm;
            echo '<img src="'.$the_biggest_image.'" />'; //echo the biggest one
        }
}
?>

url_to_absolute.php

<?php
/**
 * Edited by Nitin Kr. Gupta, publicmind.in
 */

/**
 * Copyright (c) 2008, David R. Nadeau, NadeauSoftware.com.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials provided
 *    with the distribution.
 *
 *  * Neither the names of David R. Nadeau or NadeauSoftware.com, nor
 *    the names of its contributors may be used to endorse or promote
 *    products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
 * OF SUCH DAMAGE.
 */

/*
 * This is a BSD License approved by the Open Source Initiative (OSI).
 * See:  http://www.opensource.org/licenses/bsd-license.php
 */

/**
 * Combine a base URL and a relative URL to produce a new
 * absolute URL.  The base URL is often the URL of a page,
 * and the relative URL is a URL embedded on that page.
 *
 * This function implements the "absolutize" algorithm from
 * the RFC3986 specification for URLs.
 *
 * This function supports multi-byte characters with the UTF-8 encoding,
 * per the URL specification.
 *
 * Parameters:
 *  baseUrl     the absolute base URL.
 *
 *  url     the relative URL to convert.
 *
 * Return values:
 *  An absolute URL that combines parts of the base and relative
 *  URLs, or FALSE if the base URL is not absolute or if either
 *  URL cannot be parsed.
 */
function url_to_absolute( $baseUrl, $relativeUrl )
{
    // If relative URL has a scheme, clean path and return.
    $r = split_url( $relativeUrl );
    if ( $r === FALSE )
        return FALSE;
    if ( !empty( $r['scheme'] ) )
    {
        if ( !empty( $r['path'] ) && $r['path'][0] == '/' )
            $r['path'] = url_remove_dot_segments( $r['path'] );
        return join_url( $r );
    }

    // Make sure the base URL is absolute.
    $b = split_url( $baseUrl );
    if ( $b === FALSE || empty( $b['scheme'] ) || empty( $b['host'] ) )
        return FALSE;
    $r['scheme'] = $b['scheme'];

    // If relative URL has an authority, clean path and return.
    if ( isset( $r['host'] ) )
    {
        if ( !empty( $r['path'] ) )
            $r['path'] = url_remove_dot_segments( $r['path'] );
        return join_url( $r );
    }
    unset( $r['port'] );
    unset( $r['user'] );
    unset( $r['pass'] );

    // Copy base authority.
    $r['host'] = $b['host'];
    if ( isset( $b['port'] ) ) $r['port'] = $b['port'];
    if ( isset( $b['user'] ) ) $r['user'] = $b['user'];
    if ( isset( $b['pass'] ) ) $r['pass'] = $b['pass'];

    // If relative URL has no path, use base path
    if ( empty( $r['path'] ) )
    {
        if ( !empty( $b['path'] ) )
            $r['path'] = $b['path'];
        if ( !isset( $r['query'] ) && isset( $b['query'] ) )
            $r['query'] = $b['query'];
        return join_url( $r );
    }

    // If relative URL path doesn't start with /, merge with base path
    if ( $r['path'][0] != '/' )
    {
        $base = mb_strrchr( $b['path'], '/', TRUE, 'UTF-8' );
        if ( $base === FALSE ) $base = '';
        $r['path'] = $base . '/' . $r['path'];
    }
    $r['path'] = url_remove_dot_segments( $r['path'] );
    return join_url( $r );
}

/**
 * Filter out "." and ".." segments from a URL's path and return
 * the result.
 *
 * This function implements the "remove_dot_segments" algorithm from
 * the RFC3986 specification for URLs.
 *
 * This function supports multi-byte characters with the UTF-8 encoding,
 * per the URL specification.
 *
 * Parameters:
 *  path    the path to filter
 *
 * Return values:
 *  The filtered path with "." and ".." removed.
 */
function url_remove_dot_segments( $path )
{
    // multi-byte character explode
    $inSegs  = preg_split( '!/!u', $path );
    $outSegs = array( );
    foreach ( $inSegs as $seg )
    {
        if ( $seg == '' || $seg == '.')
            continue;
        if ( $seg == '..' )
            array_pop( $outSegs );
        else
            array_push( $outSegs, $seg );
    }
    $outPath = implode( '/', $outSegs );
    if ( $path[0] == '/' )
        $outPath = '/' . $outPath;
    // compare last multi-byte character against '/'
    if ( $outPath != '/' &&
        (mb_strlen($path)-1) == mb_strrpos( $path, '/', 'UTF-8' ) )
        $outPath .= '/';
    return $outPath;
}


/**
 * This function parses an absolute or relative URL and splits it
 * into individual components.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * A portion of the ABNFs are repeated here:
 *
 *  URI-reference   = URI
 *          / relative-ref
 *
 *  URI     = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 *
 *  relative-ref    = relative-part [ "?" query ] [ "#" fragment ]
 *
 *  hier-part   = "//" authority path-abempty
 *          / path-absolute
 *          / path-rootless
 *          / path-empty
 *
 *  relative-part   = "//" authority path-abempty
 *          / path-absolute
 *          / path-noscheme
 *          / path-empty
 *
 *  authority   = [ userinfo "@" ] host [ ":" port ]
 *
 * So, a URL has the following major components:
 *
 *  scheme
 *      The name of a method used to interpret the rest of
 *      the URL.  Examples:  "http", "https", "mailto", "file'.
 *
 *  authority
 *      The name of the authority governing the URL's name
 *      space.  Examples:  "example.com", "[email protected]",
 *      "example.com:80", "user:[email protected]:80".
 *
 *      The authority may include a host name, port number,
 *      user name, and password.
 *
 *      The host may be a name, an IPv4 numeric address, or
 *      an IPv6 numeric address.
 *
 *  path
 *      The hierarchical path to the URL's resource.
 *      Examples:  "/index.htm", "/scripts/page.php".
 *
 *  query
 *      The data for a query.  Examples:  "?search=google.com".
 *
 *  fragment
 *      The name of a secondary resource relative to that named
 *      by the path.  Examples:  "#section1", "#header".
 *
 * An "absolute" URL must include a scheme and path.  The authority, query,
 * and fragment components are optional.
 *
 * A "relative" URL does not include a scheme and must include a path.  The
 * authority, query, and fragment components are optional.
 *
 * This function splits the $url argument into the following components
 * and returns them in an associative array.  Keys to that array include:
 *
 *  "scheme"    The scheme, such as "http".
 *  "host"      The host name, IPv4, or IPv6 address.
 *  "port"      The port number.
 *  "user"      The user name.
 *  "pass"      The user password.
 *  "path"      The path, such as a file path for "http".
 *  "query"     The query.
 *  "fragment"  The fragment.
 *
 * One or more of these may not be present, depending upon the URL.
 *
 * Optionally, the "user", "pass", "host" (if a name, not an IP address),
 * "path", "query", and "fragment" may have percent-encoded characters
 * decoded.  The "scheme" and "port" cannot include percent-encoded
 * characters and are never decoded.  Decoding occurs after the URL has
 * been parsed.
 *
 * Parameters:
 *  url     the URL to parse.
 *
 *  decode      an optional boolean flag selecting whether
 *          to decode percent encoding or not.  Default = TRUE.
 *
 * Return values:
 *  the associative array of URL parts, or FALSE if the URL is
 *  too malformed to recognize any parts.
 */
function split_url( $url, $decode=FALSE)
{
    // Character sets from RFC3986.
    $xunressub     = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
    $xpchar        = $xunressub . ':@% ';

    // Scheme from RFC3986.
    $xscheme        = '([a-zA-Z][a-zA-Z\d+-.]*)';

    // User info (user + password) from RFC3986.
    $xuserinfo     = '((['  . $xunressub . '%]*)' .
                     '(:([' . $xunressub . ':%]*))?)';

    // IPv4 from RFC3986 (without digit constraints).
    $xipv4         = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';

    // IPv6 from RFC2732 (without digit and grouping constraints).
    $xipv6         = '(\[([a-fA-F\d.:]+)\])';

    // Host name from RFC1035.  Technically, must start with a letter.
    // Relax that restriction to better parse URL structure, then
    // leave host name validation to application.
    $xhost_name    = '([a-zA-Z\d-.%]+)';

    // Authority from RFC3986.  Skip IP future.
    $xhost         = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
    $xport         = '(\d*)';
    $xauthority    = '((' . $xuserinfo . '@)?' . $xhost .
                 '?(:' . $xport . ')?)';

    // Path from RFC3986.  Blend absolute & relative for efficiency.
    $xslash_seg    = '(/[' . $xpchar . ']*)';
    $xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
    $xpath_rel     = '([' . $xpchar . ']+' . $xslash_seg . '*)';
    $xpath_abs     = '(/(' . $xpath_rel . ')?)';
    $xapath        = '(' . $xpath_authabs . '|' . $xpath_abs .
             '|' . $xpath_rel . ')';

    // Query and fragment from RFC3986.
    $xqueryfrag    = '([' . $xpchar . '/?' . ']*)';

    // URL.
    $xurl          = '^(' . $xscheme . ':)?' .  $xapath . '?' .
                     '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';


    // Split the URL into components.
    if ( !preg_match( '!' . $xurl . '!', $url, $m ) )
        return FALSE;

    if ( !empty($m[2]) )        $parts['scheme']  = strtolower($m[2]);

    if ( !empty($m[7]) ) {
        if ( isset( $m[9] ) )   $parts['user']    = $m[9];
        else            $parts['user']    = '';
    }
    if ( !empty($m[10]) )       $parts['pass']    = $m[11];

    if ( !empty($m[13]) )       $h=$parts['host'] = $m[13];
    else if ( !empty($m[14]) )  $parts['host']    = $m[14];
    else if ( !empty($m[16]) )  $parts['host']    = $m[16];
    else if ( !empty( $m[5] ) ) $parts['host']    = '';
    if ( !empty($m[17]) )       $parts['port']    = $m[18];

    if ( !empty($m[19]) )       $parts['path']    = $m[19];
    else if ( !empty($m[21]) )  $parts['path']    = $m[21];
    else if ( !empty($m[25]) )  $parts['path']    = $m[25];

    if ( !empty($m[27]) )       $parts['query']   = $m[28];
    if ( !empty($m[29]) )       $parts['fragment']= $m[30];

    if ( !$decode )
        return $parts;
    if ( !empty($parts['user']) )
        $parts['user']     = rawurldecode( $parts['user'] );
    if ( !empty($parts['pass']) )
        $parts['pass']     = rawurldecode( $parts['pass'] );
    if ( !empty($parts['path']) )
        $parts['path']     = rawurldecode( $parts['path'] );
    if ( isset($h) )
        $parts['host']     = rawurldecode( $parts['host'] );
    if ( !empty($parts['query']) )
        $parts['query']    = rawurldecode( $parts['query'] );
    if ( !empty($parts['fragment']) )
        $parts['fragment'] = rawurldecode( $parts['fragment'] );
    return $parts;
}


/**
 * This function joins together URL components to form a complete URL.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * This function implements the specification's "component recomposition"
 * algorithm for combining URI components into a full URI string.
 *
 * The $parts argument is an associative array containing zero or
 * more of the following:
 *
 *  "scheme"    The scheme, such as "http".
 *  "host"      The host name, IPv4, or IPv6 address.
 *  "port"      The port number.
 *  "user"      The user name.
 *  "pass"      The user password.
 *  "path"      The path, such as a file path for "http".
 *  "query"     The query.
 *  "fragment"  The fragment.
 *
 * The "port", "user", and "pass" values are only used when a "host"
 * is present.
 *
 * The optional $encode argument indicates if appropriate URL components
 * should be percent-encoded as they are assembled into the URL.  Encoding
 * is only applied to the "user", "pass", "host" (if a host name, not an
 * IP address), "path", "query", and "fragment" components.  The "scheme"
 * and "port" are never encoded.  When a "scheme" and "host" are both
 * present, the "path" is presumed to be hierarchical and encoding
 * processes each segment of the hierarchy separately (i.e., the slashes
 * are left alone).
 *
 * The assembled URL string is returned.
 *
 * Parameters:
 *  parts       an associative array of strings containing the
 *          individual parts of a URL.
 *
 *  encode      an optional boolean flag selecting whether
 *          to do percent encoding or not.  Default = true.
 *
 * Return values:
 *  Returns the assembled URL string.  The string is an absolute
 *  URL if a scheme is supplied, and a relative URL if not.  An
 *  empty string is returned if the $parts array does not contain
 *  any of the needed values.
 */
function join_url( $parts, $encode=FALSE)
{
    if ( $encode )
    {
        if ( isset( $parts['user'] ) )
            $parts['user']     = rawurlencode( $parts['user'] );
        if ( isset( $parts['pass'] ) )
            $parts['pass']     = rawurlencode( $parts['pass'] );
        if ( isset( $parts['host'] ) &&
            !preg_match( '!^(\[[\da-f.:]+\]])|([\da-f.:]+)$!ui', $parts['host'] ) )
            $parts['host']     = rawurlencode( $parts['host'] );
        if ( !empty( $parts['path'] ) )
            $parts['path']     = preg_replace( '!%2F!ui', '/',
                rawurlencode( $parts['path'] ) );
        if ( isset( $parts['query'] ) )
            $parts['query']    = rawurlencode( $parts['query'] );
        if ( isset( $parts['fragment'] ) )
            $parts['fragment'] = rawurlencode( $parts['fragment'] );
    }

    $url = '';
    if ( !empty( $parts['scheme'] ) )
        $url .= $parts['scheme'] . ':';
    if ( isset( $parts['host'] ) )
    {
        $url .= '//';
        if ( isset( $parts['user'] ) )
        {
            $url .= $parts['user'];
            if ( isset( $parts['pass'] ) )
                $url .= ':' . $parts['pass'];
            $url .= '@';
        }
        if ( preg_match( '!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'] ) )
            $url .= '[' . $parts['host'] . ']'; // IPv6
        else
            $url .= $parts['host'];         // IPv4 or name
        if ( isset( $parts['port'] ) )
            $url .= ':' . $parts['port'];
        if ( !empty( $parts['path'] ) && $parts['path'][0] != '/' )
            $url .= '/';
    }
    if ( !empty( $parts['path'] ) )
        $url .= $parts['path'];
    if ( isset( $parts['query'] ) )
        $url .= '?' . $parts['query'];
    if ( isset( $parts['fragment'] ) )
        $url .= '#' . $parts['fragment'];
    return $url;
}

/**
 * This function encodes URL to form a URL which is properly 
 * percent encoded to replace disallowed characters.
 *
 * RFC3986 specifies the allowed characters in the URL as well as
 * reserved characters in the URL. This function replaces all the 
 * disallowed characters in the URL with their repective percent 
 * encodings. Already encoded characters are not encoded again,
 * such as '%20' is not encoded to '%2520'.
 *
 * Parameters:
 *  url     the url to encode.
 *
 * Return values:
 *  Returns the encoded URL string. 
 */
function encode_url($url) {
  $reserved = array(
    ":" => '!%3A!ui',
    "/" => '!%2F!ui',
    "?" => '!%3F!ui',
    "#" => '!%23!ui',
    "[" => '!%5B!ui',
    "]" => '!%5D!ui',
    "@" => '!%40!ui',
    "!" => '!%21!ui',
    "$" => '!%24!ui',
    "&" => '!%26!ui',
    "'" => '!%27!ui',
    "(" => '!%28!ui',
    ")" => '!%29!ui',
    "*" => '!%2A!ui',
    "+" => '!%2B!ui',
    "," => '!%2C!ui',
    ";" => '!%3B!ui',
    "=" => '!%3D!ui',
    "%" => '!%25!ui',
  );

  $url = rawurlencode($url);
  $url = preg_replace(array_values($reserved), array_keys($reserved), $url);
  return $url;
}

?>

原文

I am making an image search project, and I want to get one page's biggest images.
I added some code to fix the images real address, remove images which are possible ads. the compare there width*height echo out the biggest one. but my code has some problem. here is my whole code. Can anyone help me to fix where is wrong and how to optimize the code, I feel the process is a bitter slowly. Thanks to all.

<?php
require_once 'simple_html_dom.php';
require 'url_to_absolute.php'; //get image absolute url
$v = 'http://www.yomiuri.co.jp/stream/';
$html = file_get_html($v);
$maxsize = -1; 
$the_biggest_image = false;
$arr = array('ad', 'ads','gif');// add ads possible words as a arry which is check in the image url
foreach($html->find('img') as $element) {
    preg_match_all('#https?://(.*?)($|/)#m', urldecode(stripcslashes($v)), $r); //get site base url
    $pic = $element->src;
    $comm = url_to_absolute( $r[0][0], $pic);//get image absolute url
    $check_flag = true;
    foreach($arr as $item) {
        if (substr_count(strtolower($comm),$item) > 0) $check_flag = false;
    }// remove ads images
    if ($check_flag) $arr = @getimagesize($comm);// get the rest images width and height
    reset($comm);
        if (($arr[0] * $arr[1]) > $maxsize) {   
            $maxsize = $arr[0] * $arr[1];  //compare images' sise
            $the_biggest_image = $comm;
            echo '<img src="'.$the_biggest_image.'" />'; //echo the biggest one
        }
}
?>

url_to_absolute.php

<?php
/**
 * Edited by Nitin Kr. Gupta, publicmind.in
 */

/**
 * Copyright (c) 2008, David R. Nadeau, NadeauSoftware.com.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials provided
 *    with the distribution.
 *
 *  * Neither the names of David R. Nadeau or NadeauSoftware.com, nor
 *    the names of its contributors may be used to endorse or promote
 *    products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
 * OF SUCH DAMAGE.
 */

/*
 * This is a BSD License approved by the Open Source Initiative (OSI).
 * See:  http://www.opensource.org/licenses/bsd-license.php
 */

/**
 * Combine a base URL and a relative URL to produce a new
 * absolute URL.  The base URL is often the URL of a page,
 * and the relative URL is a URL embedded on that page.
 *
 * This function implements the "absolutize" algorithm from
 * the RFC3986 specification for URLs.
 *
 * This function supports multi-byte characters with the UTF-8 encoding,
 * per the URL specification.
 *
 * Parameters:
 *  baseUrl     the absolute base URL.
 *
 *  url     the relative URL to convert.
 *
 * Return values:
 *  An absolute URL that combines parts of the base and relative
 *  URLs, or FALSE if the base URL is not absolute or if either
 *  URL cannot be parsed.
 */
function url_to_absolute( $baseUrl, $relativeUrl )
{
    // If relative URL has a scheme, clean path and return.
    $r = split_url( $relativeUrl );
    if ( $r === FALSE )
        return FALSE;
    if ( !empty( $r['scheme'] ) )
    {
        if ( !empty( $r['path'] ) && $r['path'][0] == '/' )
            $r['path'] = url_remove_dot_segments( $r['path'] );
        return join_url( $r );
    }

    // Make sure the base URL is absolute.
    $b = split_url( $baseUrl );
    if ( $b === FALSE || empty( $b['scheme'] ) || empty( $b['host'] ) )
        return FALSE;
    $r['scheme'] = $b['scheme'];

    // If relative URL has an authority, clean path and return.
    if ( isset( $r['host'] ) )
    {
        if ( !empty( $r['path'] ) )
            $r['path'] = url_remove_dot_segments( $r['path'] );
        return join_url( $r );
    }
    unset( $r['port'] );
    unset( $r['user'] );
    unset( $r['pass'] );

    // Copy base authority.
    $r['host'] = $b['host'];
    if ( isset( $b['port'] ) ) $r['port'] = $b['port'];
    if ( isset( $b['user'] ) ) $r['user'] = $b['user'];
    if ( isset( $b['pass'] ) ) $r['pass'] = $b['pass'];

    // If relative URL has no path, use base path
    if ( empty( $r['path'] ) )
    {
        if ( !empty( $b['path'] ) )
            $r['path'] = $b['path'];
        if ( !isset( $r['query'] ) && isset( $b['query'] ) )
            $r['query'] = $b['query'];
        return join_url( $r );
    }

    // If relative URL path doesn't start with /, merge with base path
    if ( $r['path'][0] != '/' )
    {
        $base = mb_strrchr( $b['path'], '/', TRUE, 'UTF-8' );
        if ( $base === FALSE ) $base = '';
        $r['path'] = $base . '/' . $r['path'];
    }
    $r['path'] = url_remove_dot_segments( $r['path'] );
    return join_url( $r );
}

/**
 * Filter out "." and ".." segments from a URL's path and return
 * the result.
 *
 * This function implements the "remove_dot_segments" algorithm from
 * the RFC3986 specification for URLs.
 *
 * This function supports multi-byte characters with the UTF-8 encoding,
 * per the URL specification.
 *
 * Parameters:
 *  path    the path to filter
 *
 * Return values:
 *  The filtered path with "." and ".." removed.
 */
function url_remove_dot_segments( $path )
{
    // multi-byte character explode
    $inSegs  = preg_split( '!/!u', $path );
    $outSegs = array( );
    foreach ( $inSegs as $seg )
    {
        if ( $seg == '' || $seg == '.')
            continue;
        if ( $seg == '..' )
            array_pop( $outSegs );
        else
            array_push( $outSegs, $seg );
    }
    $outPath = implode( '/', $outSegs );
    if ( $path[0] == '/' )
        $outPath = '/' . $outPath;
    // compare last multi-byte character against '/'
    if ( $outPath != '/' &&
        (mb_strlen($path)-1) == mb_strrpos( $path, '/', 'UTF-8' ) )
        $outPath .= '/';
    return $outPath;
}


/**
 * This function parses an absolute or relative URL and splits it
 * into individual components.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * A portion of the ABNFs are repeated here:
 *
 *  URI-reference   = URI
 *          / relative-ref
 *
 *  URI     = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 *
 *  relative-ref    = relative-part [ "?" query ] [ "#" fragment ]
 *
 *  hier-part   = "//" authority path-abempty
 *          / path-absolute
 *          / path-rootless
 *          / path-empty
 *
 *  relative-part   = "//" authority path-abempty
 *          / path-absolute
 *          / path-noscheme
 *          / path-empty
 *
 *  authority   = [ userinfo "@" ] host [ ":" port ]
 *
 * So, a URL has the following major components:
 *
 *  scheme
 *      The name of a method used to interpret the rest of
 *      the URL.  Examples:  "http", "https", "mailto", "file'.
 *
 *  authority
 *      The name of the authority governing the URL's name
 *      space.  Examples:  "example.com", "[email protected]",
 *      "example.com:80", "user:[email protected]:80".
 *
 *      The authority may include a host name, port number,
 *      user name, and password.
 *
 *      The host may be a name, an IPv4 numeric address, or
 *      an IPv6 numeric address.
 *
 *  path
 *      The hierarchical path to the URL's resource.
 *      Examples:  "/index.htm", "/scripts/page.php".
 *
 *  query
 *      The data for a query.  Examples:  "?search=google.com".
 *
 *  fragment
 *      The name of a secondary resource relative to that named
 *      by the path.  Examples:  "#section1", "#header".
 *
 * An "absolute" URL must include a scheme and path.  The authority, query,
 * and fragment components are optional.
 *
 * A "relative" URL does not include a scheme and must include a path.  The
 * authority, query, and fragment components are optional.
 *
 * This function splits the $url argument into the following components
 * and returns them in an associative array.  Keys to that array include:
 *
 *  "scheme"    The scheme, such as "http".
 *  "host"      The host name, IPv4, or IPv6 address.
 *  "port"      The port number.
 *  "user"      The user name.
 *  "pass"      The user password.
 *  "path"      The path, such as a file path for "http".
 *  "query"     The query.
 *  "fragment"  The fragment.
 *
 * One or more of these may not be present, depending upon the URL.
 *
 * Optionally, the "user", "pass", "host" (if a name, not an IP address),
 * "path", "query", and "fragment" may have percent-encoded characters
 * decoded.  The "scheme" and "port" cannot include percent-encoded
 * characters and are never decoded.  Decoding occurs after the URL has
 * been parsed.
 *
 * Parameters:
 *  url     the URL to parse.
 *
 *  decode      an optional boolean flag selecting whether
 *          to decode percent encoding or not.  Default = TRUE.
 *
 * Return values:
 *  the associative array of URL parts, or FALSE if the URL is
 *  too malformed to recognize any parts.
 */
function split_url( $url, $decode=FALSE)
{
    // Character sets from RFC3986.
    $xunressub     = 'a-zA-Z\d\-._~\!amp;\'()*+,;=';
    $xpchar        = $xunressub . ':@% ';

    // Scheme from RFC3986.
    $xscheme        = '([a-zA-Z][a-zA-Z\d+-.]*)';

    // User info (user + password) from RFC3986.
    $xuserinfo     = '((['  . $xunressub . '%]*)' .
                     '(:([' . $xunressub . ':%]*))?)';

    // IPv4 from RFC3986 (without digit constraints).
    $xipv4         = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';

    // IPv6 from RFC2732 (without digit and grouping constraints).
    $xipv6         = '(\[([a-fA-F\d.:]+)\])';

    // Host name from RFC1035.  Technically, must start with a letter.
    // Relax that restriction to better parse URL structure, then
    // leave host name validation to application.
    $xhost_name    = '([a-zA-Z\d-.%]+)';

    // Authority from RFC3986.  Skip IP future.
    $xhost         = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
    $xport         = '(\d*)';
    $xauthority    = '((' . $xuserinfo . '@)?' . $xhost .
                 '?(:' . $xport . ')?)';

    // Path from RFC3986.  Blend absolute & relative for efficiency.
    $xslash_seg    = '(/[' . $xpchar . ']*)';
    $xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
    $xpath_rel     = '([' . $xpchar . ']+' . $xslash_seg . '*)';
    $xpath_abs     = '(/(' . $xpath_rel . ')?)';
    $xapath        = '(' . $xpath_authabs . '|' . $xpath_abs .
             '|' . $xpath_rel . ')';

    // Query and fragment from RFC3986.
    $xqueryfrag    = '([' . $xpchar . '/?' . ']*)';

    // URL.
    $xurl          = '^(' . $xscheme . ':)?' .  $xapath . '?' .
                     '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?
;


    // Split the URL into components.
    if ( !preg_match( '!' . $xurl . '!', $url, $m ) )
        return FALSE;

    if ( !empty($m[2]) )        $parts['scheme']  = strtolower($m[2]);

    if ( !empty($m[7]) ) {
        if ( isset( $m[9] ) )   $parts['user']    = $m[9];
        else            $parts['user']    = '';
    }
    if ( !empty($m[10]) )       $parts['pass']    = $m[11];

    if ( !empty($m[13]) )       $h=$parts['host'] = $m[13];
    else if ( !empty($m[14]) )  $parts['host']    = $m[14];
    else if ( !empty($m[16]) )  $parts['host']    = $m[16];
    else if ( !empty( $m[5] ) ) $parts['host']    = '';
    if ( !empty($m[17]) )       $parts['port']    = $m[18];

    if ( !empty($m[19]) )       $parts['path']    = $m[19];
    else if ( !empty($m[21]) )  $parts['path']    = $m[21];
    else if ( !empty($m[25]) )  $parts['path']    = $m[25];

    if ( !empty($m[27]) )       $parts['query']   = $m[28];
    if ( !empty($m[29]) )       $parts['fragment']= $m[30];

    if ( !$decode )
        return $parts;
    if ( !empty($parts['user']) )
        $parts['user']     = rawurldecode( $parts['user'] );
    if ( !empty($parts['pass']) )
        $parts['pass']     = rawurldecode( $parts['pass'] );
    if ( !empty($parts['path']) )
        $parts['path']     = rawurldecode( $parts['path'] );
    if ( isset($h) )
        $parts['host']     = rawurldecode( $parts['host'] );
    if ( !empty($parts['query']) )
        $parts['query']    = rawurldecode( $parts['query'] );
    if ( !empty($parts['fragment']) )
        $parts['fragment'] = rawurldecode( $parts['fragment'] );
    return $parts;
}


/**
 * This function joins together URL components to form a complete URL.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * This function implements the specification's "component recomposition"
 * algorithm for combining URI components into a full URI string.
 *
 * The $parts argument is an associative array containing zero or
 * more of the following:
 *
 *  "scheme"    The scheme, such as "http".
 *  "host"      The host name, IPv4, or IPv6 address.
 *  "port"      The port number.
 *  "user"      The user name.
 *  "pass"      The user password.
 *  "path"      The path, such as a file path for "http".
 *  "query"     The query.
 *  "fragment"  The fragment.
 *
 * The "port", "user", and "pass" values are only used when a "host"
 * is present.
 *
 * The optional $encode argument indicates if appropriate URL components
 * should be percent-encoded as they are assembled into the URL.  Encoding
 * is only applied to the "user", "pass", "host" (if a host name, not an
 * IP address), "path", "query", and "fragment" components.  The "scheme"
 * and "port" are never encoded.  When a "scheme" and "host" are both
 * present, the "path" is presumed to be hierarchical and encoding
 * processes each segment of the hierarchy separately (i.e., the slashes
 * are left alone).
 *
 * The assembled URL string is returned.
 *
 * Parameters:
 *  parts       an associative array of strings containing the
 *          individual parts of a URL.
 *
 *  encode      an optional boolean flag selecting whether
 *          to do percent encoding or not.  Default = true.
 *
 * Return values:
 *  Returns the assembled URL string.  The string is an absolute
 *  URL if a scheme is supplied, and a relative URL if not.  An
 *  empty string is returned if the $parts array does not contain
 *  any of the needed values.
 */
function join_url( $parts, $encode=FALSE)
{
    if ( $encode )
    {
        if ( isset( $parts['user'] ) )
            $parts['user']     = rawurlencode( $parts['user'] );
        if ( isset( $parts['pass'] ) )
            $parts['pass']     = rawurlencode( $parts['pass'] );
        if ( isset( $parts['host'] ) &&
            !preg_match( '!^(\[[\da-f.:]+\]])|([\da-f.:]+)$!ui', $parts['host'] ) )
            $parts['host']     = rawurlencode( $parts['host'] );
        if ( !empty( $parts['path'] ) )
            $parts['path']     = preg_replace( '!%2F!ui', '/',
                rawurlencode( $parts['path'] ) );
        if ( isset( $parts['query'] ) )
            $parts['query']    = rawurlencode( $parts['query'] );
        if ( isset( $parts['fragment'] ) )
            $parts['fragment'] = rawurlencode( $parts['fragment'] );
    }

    $url = '';
    if ( !empty( $parts['scheme'] ) )
        $url .= $parts['scheme'] . ':';
    if ( isset( $parts['host'] ) )
    {
        $url .= '//';
        if ( isset( $parts['user'] ) )
        {
            $url .= $parts['user'];
            if ( isset( $parts['pass'] ) )
                $url .= ':' . $parts['pass'];
            $url .= '@';
        }
        if ( preg_match( '!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'] ) )
            $url .= '[' . $parts['host'] . ']'; // IPv6
        else
            $url .= $parts['host'];         // IPv4 or name
        if ( isset( $parts['port'] ) )
            $url .= ':' . $parts['port'];
        if ( !empty( $parts['path'] ) && $parts['path'][0] != '/' )
            $url .= '/';
    }
    if ( !empty( $parts['path'] ) )
        $url .= $parts['path'];
    if ( isset( $parts['query'] ) )
        $url .= '?' . $parts['query'];
    if ( isset( $parts['fragment'] ) )
        $url .= '#' . $parts['fragment'];
    return $url;
}

/**
 * This function encodes URL to form a URL which is properly 
 * percent encoded to replace disallowed characters.
 *
 * RFC3986 specifies the allowed characters in the URL as well as
 * reserved characters in the URL. This function replaces all the 
 * disallowed characters in the URL with their repective percent 
 * encodings. Already encoded characters are not encoded again,
 * such as '%20' is not encoded to '%2520'.
 *
 * Parameters:
 *  url     the url to encode.
 *
 * Return values:
 *  Returns the encoded URL string. 
 */
function encode_url($url) {
  $reserved = array(
    ":" => '!%3A!ui',
    "/" => '!%2F!ui',
    "?" => '!%3F!ui',
    "#" => '!%23!ui',
    "[" => '!%5B!ui',
    "]" => '!%5D!ui',
    "@" => '!%40!ui',
    "!" => '!%21!ui',
    "$" => '!%24!ui',
    "&" => '!%26!ui',
    "'" => '!%27!ui',
    "(" => '!%28!ui',
    ")" => '!%29!ui',
    "*" => '!%2A!ui',
    "+" => '!%2B!ui',
    "," => '!%2C!ui',
    ";" => '!%3B!ui',
    "=" => '!%3D!ui',
    "%" => '!%25!ui',
  );

  $url = rawurlencode($url);
  $url = preg_replace(array_values($reserved), array_keys($reserved), $url);
  return $url;
}

?>

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

逆夏时光 2024-11-07 18:51:37

您还没有真正说出您遇到的错误，但幸运的是您的代码中有一些错误。可能会给您带来错误的部分位于此块中：

if ($check_flag) $arr = @getimagesize($comm);// get the rest images width and height
reset($comm);
if (($arr[0] * $arr[1]) > $maxsize) {   
    $maxsize = $arr[0] * $arr[1];  //compare images' sise
    $the_biggest_image = $comm;
    echo '<img src="'.$the_biggest_image.'" />'; //echo the biggest one
}

您重写 $arr = @getimagesize($comm); 这是您的“广告”过滤变量。
如果 $check_flag 为 false，您仍然执行以下计算语句
reset() 不适用于字符串。
每当您更新最大大小时，您都会回显 $the_biggest_image 。这是有意的吗？

更新

尝试让你的代码工作并希望稍微好一点：

<?php
require_once 'simple_html_dom.php';
require 'url_to_absolute.php'; //get image absolute url
// options
$url = 'http://www.yomiuri.co.jp/stream/';
$ignore = array('ad', 'ads','gif');// add ads possible words as a arry which is check in the image url
$biggestImage = 'path to "no image found" image';
// process
$maxSize = -1;
$visited = array();
$html = file_get_html($url);
// base url
$parts=parse_url($url);
$host=$parts['scheme'].'://'.$parts['host'];
// loop images
foreach($html->find('img') as $element) {
    $pic = $element->src;
    if($pic=='')continue;// it happens on your test url
    $absUrl = url_to_absolute($host, $pic);//get image absolute url
    // ignore already seen images, add new images
    if(in_array($absUrl, $visited))continue;
    $visited[]=$absUrl;
    // remove ads images
    $ignoring=false;
    foreach($ignore as $item)
        if (stripos($absUrl,$item)!==false){
            $ignoring=true;
            break;
        }
    if($ignoring)continue;
    // get image
    $image=@getimagesize($absUrl);// get the rest images width and height
    if (($image[0] * $image[1]) > $maxSize) {   
        $maxSize = $image[0] * $image[1];  //compare images' sise
        $biggestImage = $absUrl;
    }
}
echo '<img src="'.$biggestImage.'" />'; //echo the biggest one
?>

You haven't really said what error you have but luckily you've got a couple of errors in your code. The ones that might be giving you errors are in this block:

if ($check_flag) $arr = @getimagesize($comm);// get the rest images width and height
reset($comm);
if (($arr[0] * $arr[1]) > $maxsize) {   
    $maxsize = $arr[0] * $arr[1];  //compare images' sise
    $the_biggest_image = $comm;
    echo '<img src="'.$the_biggest_image.'" />'; //echo the biggest one
}

You rewrite $arr = @getimagesize($comm); which is your "ads" filtering variable.
If $check_flag is false, you still do the following computational statements
reset() does not work on strings.
You echo $the_biggest_image whenever you update max size. is that intended?

UPDATE

Attempt at making your code work and hopefully slightly better:

<?php
require_once 'simple_html_dom.php';
require 'url_to_absolute.php'; //get image absolute url
// options
$url = 'http://www.yomiuri.co.jp/stream/';
$ignore = array('ad', 'ads','gif');// add ads possible words as a arry which is check in the image url
$biggestImage = 'path to "no image found" image';
// process
$maxSize = -1;
$visited = array();
$html = file_get_html($url);
// base url
$parts=parse_url($url);
$host=$parts['scheme'].'://'.$parts['host'];
// loop images
foreach($html->find('img') as $element) {
    $pic = $element->src;
    if($pic=='')continue;// it happens on your test url
    $absUrl = url_to_absolute($host, $pic);//get image absolute url
    // ignore already seen images, add new images
    if(in_array($absUrl, $visited))continue;
    $visited[]=$absUrl;
    // remove ads images
    $ignoring=false;
    foreach($ignore as $item)
        if (stripos($absUrl,$item)!==false){
            $ignoring=true;
            break;
        }
    if($ignoring)continue;
    // get image
    $image=@getimagesize($absUrl);// get the rest images width and height
    if (($image[0] * $image[1]) > $maxSize) {   
        $maxSize = $image[0] * $image[1];  //compare images' sise
        $biggestImage = $absUrl;
    }
}
echo '<img src="'.$biggestImage.'" />'; //echo the biggest one
?>

回复收藏 0 原文

叹倦 2024-11-07 18:51:37

根据您的代码，我创建了以下解决方案 - 它使用相同的逻辑，它允许您设置图像的最小宽度和高度，以确保它返回正确的图像

private function getMainImageFromUrl($pageUrl) {

    $biggestImage = '';
    $minImgWidth = 300;
    $minImgHeight = 300;
    $images = $this->getImagesFromDom($pageUrl);
    $visited = array();
    $maxSize = -1;
    $ignore = array('ad', 'ads','gif'); // get rid of ads (check if these contains following)

    foreach ($images as $image) {
        $pic = $image->getAttribute('src');
        # if source is empty, skip to another image
        if ( empty( $pic ) )
            continue;
        # get image absolute url
        $absUrl = url_to_absolute($pic);
        # ignore already seen images (skip to another), add new images
        if ( in_array( $absUrl, $visited ) )
            continue;
        $visited[] = $absUrl;
        # remove ads
        $ignoring = false;
        foreach($ignore as $item)
            if ( stripos( $absUrl,$item ) !== false ){

                $ignoring=true;
                break;

            }
        if ( $ignoring )
            continue;
        $imageSize = @getimagesize($absUrl);
        if ( ( $imageSize[0] * $imageSize[1] ) > $maxSize) {
            $maxSize = $imageSize[0] * $imageSize[1];
            if ($minImgWidth < $imageSize[0] && $minImgHeight < $imageSize[1])
                $biggestImage = $absUrl;
        }
    }
    return $biggestImage;
}

private function getImagesFromDom( $url ) {
    ini_set('default_socket_timeout', 4);
    $dom = new DOMDocument();
    @$dom->loadHTMLFile( $url );
    $dom->preserveWhiteSpace = false;

    # Get images from DOM
    return $dom->getElementsByTagName('img');
}

Based on your code I have created following solution - it uses the same logic and it lets you to set min width and height for image in order to make sure it's returning the right images

private function getMainImageFromUrl($pageUrl) {

    $biggestImage = '';
    $minImgWidth = 300;
    $minImgHeight = 300;
    $images = $this->getImagesFromDom($pageUrl);
    $visited = array();
    $maxSize = -1;
    $ignore = array('ad', 'ads','gif'); // get rid of ads (check if these contains following)

    foreach ($images as $image) {
        $pic = $image->getAttribute('src');
        # if source is empty, skip to another image
        if ( empty( $pic ) )
            continue;
        # get image absolute url
        $absUrl = url_to_absolute($pic);
        # ignore already seen images (skip to another), add new images
        if ( in_array( $absUrl, $visited ) )
            continue;
        $visited[] = $absUrl;
        # remove ads
        $ignoring = false;
        foreach($ignore as $item)
            if ( stripos( $absUrl,$item ) !== false ){

                $ignoring=true;
                break;

            }
        if ( $ignoring )
            continue;
        $imageSize = @getimagesize($absUrl);
        if ( ( $imageSize[0] * $imageSize[1] ) > $maxSize) {
            $maxSize = $imageSize[0] * $imageSize[1];
            if ($minImgWidth < $imageSize[0] && $minImgHeight < $imageSize[1])
                $biggestImage = $absUrl;
        }
    }
    return $biggestImage;
}

private function getImagesFromDom( $url ) {
    ini_set('default_socket_timeout', 4);
    $dom = new DOMDocument();
    @$dom->loadHTMLFile( $url );
    $dom->preserveWhiteSpace = false;

    # Get images from DOM
    return $dom->getElementsByTagName('img');
}

回复收藏 0 原文

~没有更多了~