为什么我的脚本没有处理数组中的所有元素？

发布于 2024-09-24 21:17:49 字数 4828 浏览 0 评论 0原文

以下代码是一个测试，用于测试我已经使用新发现的线程知识完成的操作。

#!/usr/bin/perl
use strict;
use warnings;
use threads;
use threads::shared;
use URI;
use URI::http;
use File::Basename;
use DBI;
use HTML::Parser;
use LWP::Simple;
require LWP::UserAgent;
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->max_redirect(0);

print "Starting main program\n";

my @urls = ('http://www.actwebdesigns.co.uk', 'http://www.1st4pets.com', 'http://www.special4you.com');
my @threads;
while ( @urls ) {
        my $url = shift ( @urls );
        my $t = threads->new(\&scan, $url);
        push(@threads,$t);
}
while (@threads) {
        my $url_thread = shift(@threads)->join;
}
sub resolve_href {
    my ($base, $href) = @_;
    my $u = URI->new_abs($href, $base);
    return $u->canonical;   
}
sub redirect_test {
    my $url = shift;
    my $redirect_limit = 10;
    my $y = 0;
    my( $response, $responseCode );
    while( 1 && $y le $redirect_limit ) {
        $response = $ua->get($url);
        $responseCode = $response->code;
        if( $responseCode == 200 || $responseCode == 301 || $responseCode == 302 ) {
            if( $responseCode == 301 || $responseCode == 302 ) {
                $url = resolve_href( $url, $response->header('Location') );
            }else{
                last;
            }
        }else{
            last;
        }
        $y++;
    }
    return ($url, $response, $responseCode, $redirect_limit, $y );
}
sub scan {
        my $url = shift;
        my @hrefs_found;
        print "started scanning: $url\n";
        my $info = URI::http->new($url);
        # if url is not an absolute url
        if( ! defined( $info->host ) ) {
            print "Invalid URL: $url \n";    
        }else{
             my $host = $info->host;
            $host =~ s/^www\.//;
            # check to see if url is valid, checks for redirects (max of 10)
            my @urlI = redirect_test( $url );
            my $content = '';
            # checks to see if url did not redirect more than 10 times and that response returned was 200
            if( $urlI[4] != $urlI[3] && $urlI[2] == 200 ) { 
                $content = $urlI[1]->content;
                die "get failed: " . $urlI[0] if ( ! defined $content );
            }
            # sticks all hrefs on a page in an array
            my @pageLinksArray = ( $content =~ m/href=["']([^"']*)["']/g );
            # foreach links found
            foreach( @pageLinksArray ) {
                # make href an absolute url
                my $url_found = resolve_href( $urlI[0], $_ );
                # check if url looks like a valid url
                if( $url_found =~ m/^http:\/\// ) {
                    my $info = URI::http->new($url_found);
                    # check to see if url is a valid url
                    if( ! defined( $info->host ) ) {
                        print "Invalid URL: $url_found \n";    
                    }else{
                        my %values_index;
                        @values_index{@hrefs_found} = ();
                        my %values_index2;
                        @values_index2{@urls} = ();
                        # if url is not already been found
                        if( ! exists $values_index{$url_found} && ! exists $values_index2{$url_found} ) {
                            # add to arrays
                            push( @hrefs_found, $url_found );
                            push( @urls, $url_found );
                        }
                    }
                }
            }
            print "$url found " . scalar @hrefs_found . "\n";

        }
        return $url;
}

问题是，在脚本末尾附近，新找到的 url 被添加到数组中，但脚本顶部的代码没有处理它们，即它只遍历第一个测试 url。

谁能明白为什么会发生这种情况？

问候，

菲尔

编辑 **

我尝试通过执行以下操作来暂停它：

while ( @urls ) {
my $url = shift ( @urls );
my $t = threads->new(\&scan, $url);
push(@threads,$t);
my $n = 0;
while( 1 ) {
    if( scalar @urls == 1 ) {
        sleep 10;
    }else{
        last;
    }
    if( $n >= 1 ) {
        print "IN ARRAY URLS:\n\n";
        print @urls;
        print "\n\n";
        die "Process taking too long.";
        last;
    }
    $n++;
}

}

但它没有似乎什么也没做。

结果是：

Starting main program
started scanning: http://www.actwebdesigns.co.uk
started scanning: http://www.1st4pets.com
http://www.actwebdesigns.co.uk found 24
http://www.1st4pets.com found 17
IN ARRAY URLS:

http://www.stackoverflow.com

Process taking too long. at C:\perlscripts\thread.pl line 38.
Perl exited with active threads:
        0 running and unjoined
        2 finished and unjoined
        0 running and detached

原文

The following code is a test to test what I have already done with my new found knoledge of threads.

#!/usr/bin/perl
use strict;
use warnings;
use threads;
use threads::shared;
use URI;
use URI::http;
use File::Basename;
use DBI;
use HTML::Parser;
use LWP::Simple;
require LWP::UserAgent;
my $ua = LWP::UserAgent->new;
$ua->timeout(10);
$ua->env_proxy;
$ua->max_redirect(0);

print "Starting main program\n";

my @urls = ('http://www.actwebdesigns.co.uk', 'http://www.1st4pets.com', 'http://www.special4you.com');
my @threads;
while ( @urls ) {
        my $url = shift ( @urls );
        my $t = threads->new(\&scan, $url);
        push(@threads,$t);
}
while (@threads) {
        my $url_thread = shift(@threads)->join;
}
sub resolve_href {
    my ($base, $href) = @_;
    my $u = URI->new_abs($href, $base);
    return $u->canonical;   
}
sub redirect_test {
    my $url = shift;
    my $redirect_limit = 10;
    my $y = 0;
    my( $response, $responseCode );
    while( 1 && $y le $redirect_limit ) {
        $response = $ua->get($url);
        $responseCode = $response->code;
        if( $responseCode == 200 || $responseCode == 301 || $responseCode == 302 ) {
            if( $responseCode == 301 || $responseCode == 302 ) {
                $url = resolve_href( $url, $response->header('Location') );
            }else{
                last;
            }
        }else{
            last;
        }
        $y++;
    }
    return ($url, $response, $responseCode, $redirect_limit, $y );
}
sub scan {
        my $url = shift;
        my @hrefs_found;
        print "started scanning: $url\n";
        my $info = URI::http->new($url);
        # if url is not an absolute url
        if( ! defined( $info->host ) ) {
            print "Invalid URL: $url \n";    
        }else{
             my $host = $info->host;
            $host =~ s/^www\.//;
            # check to see if url is valid, checks for redirects (max of 10)
            my @urlI = redirect_test( $url );
            my $content = '';
            # checks to see if url did not redirect more than 10 times and that response returned was 200
            if( $urlI[4] != $urlI[3] && $urlI[2] == 200 ) { 
                $content = $urlI[1]->content;
                die "get failed: " . $urlI[0] if ( ! defined $content );
            }
            # sticks all hrefs on a page in an array
            my @pageLinksArray = ( $content =~ m/href=["']([^"']*)["']/g );
            # foreach links found
            foreach( @pageLinksArray ) {
                # make href an absolute url
                my $url_found = resolve_href( $urlI[0], $_ );
                # check if url looks like a valid url
                if( $url_found =~ m/^http:\/\// ) {
                    my $info = URI::http->new($url_found);
                    # check to see if url is a valid url
                    if( ! defined( $info->host ) ) {
                        print "Invalid URL: $url_found \n";    
                    }else{
                        my %values_index;
                        @values_index{@hrefs_found} = ();
                        my %values_index2;
                        @values_index2{@urls} = ();
                        # if url is not already been found
                        if( ! exists $values_index{$url_found} && ! exists $values_index2{$url_found} ) {
                            # add to arrays
                            push( @hrefs_found, $url_found );
                            push( @urls, $url_found );
                        }
                    }
                }
            }
            print "$url found " . scalar @hrefs_found . "\n";

        }
        return $url;
}

The problem being, near the end of the script the new found urls are added into the arrays but the code at the top of the script is not processing them i.e. it is only going through the first test urls.

Can anyone see why this is happening?

Regards,

Phil

EDIT **

I have tried to pause it by doing something like this:

while ( @urls ) {
my $url = shift ( @urls );
my $t = threads->new(\&scan, $url);
push(@threads,$t);
my $n = 0;
while( 1 ) {
    if( scalar @urls == 1 ) {
        sleep 10;
    }else{
        last;
    }
    if( $n >= 1 ) {
        print "IN ARRAY URLS:\n\n";
        print @urls;
        print "\n\n";
        die "Process taking too long.";
        last;
    }
    $n++;
}

}

But it doesn't seem to do anything.

the result being:

Starting main program
started scanning: http://www.actwebdesigns.co.uk
started scanning: http://www.1st4pets.com
http://www.actwebdesigns.co.uk found 24
http://www.1st4pets.com found 17
IN ARRAY URLS:

http://www.stackoverflow.com

Process taking too long. at C:\perlscripts\thread.pl line 38.
Perl exited with active threads:
        0 running and unjoined
        2 finished and unjoined
        0 running and detached

分享到QQ

分享到微博