CS50 DNA PSET6在小数据库上工作，但在大数据库上不起作用

发布于 2025-02-13 02:32:05 字数 2674 浏览 0 评论 0原文

这是我解决Python中CS50 PSET6 DNA问题的方法。它在小数据库上正常工作，但给出了

索引错误：列表索引以外的范围。

我尝试打印以查看错误在哪里。它也打印出大型数据库。不确定下一步该怎么做。

import csv
import sys


def main():

    # TODO: Check for command-line usage
    if len(sys.argv) != 3:
        print("Usage: python dna.py database.csv sequence.txt")
        sys.exit(1)

    # TODO: Read database file into a variable
    dna_database =[]
    with open(sys.argv[1], "r") as dna_data_file:
        reader = csv.DictReader(dna_data_file)
        for row in reader:
            dna_database.append(row)


    # TODO: Read DNA sequence file into a variable
    with open(sys.argv[2], "r") as load_sequence:
        sequence = load_sequence.read()

    # TODO: Find longest match of each STR in DNA sequence
    STR = list(dna_database[0].keys())[1:]
    STR_match ={}
    for i in range(len(dna_database)):
        # print(dna_database)
        STR_match[STR[i]] = longest_match(sequence,STR[i])

    # TODO: Check database for matching profiles
    for i in range(len(dna_database)):
        matches = 0
        for j in range(len(STR)):
            if int(STR_match[STR[j]]) == int(dna_database[i][STR[j]]):
                matches += 1
                if matches == len(STR):
                    print(dna_database[i]['name'])
                    sys.exit(0)

    print("No Match")

    return


def longest_match(sequence, subsequence):
    """Returns length of longest run of subsequence in sequence."""

    # Initialize variables
    longest_run = 0
    subsequence_length = len(subsequence)
    sequence_length = len(sequence)

    # Check each character in sequence for most consecutive runs of subsequence
    for i in range(sequence_length):

        # Initialize count of consecutive runs
        count = 0

        # Check for a subsequence match in a "substring" (a subset of characters) within sequence
        # If a match, move substring to next potential match in sequence
        # Continue moving substring and checking for matches until out of consecutive matches
        while True:

            # Adjust substring start and end
            start = i + count * subsequence_length
            end = start + subsequence_length

            # If there is a match in the substring
            if sequence[start:end] == subsequence:
                count += 1

            # If there is no match in the substring
            else:
                break

        # Update most consecutive matches found
        longest_run = max(longest_run, count)

    # After checking for runs at each character in seqeuence, return longest run found
    return longest_run


main()

原文

This is my solution to CS50 pset6 DNA problem in python. It works fine on small database but gives an

Index error: List Index Out of range.

I tried print to see where is the error.. It prints out large database as well. Not sure what to do next.

import csv
import sys


def main():

    # TODO: Check for command-line usage
    if len(sys.argv) != 3:
        print("Usage: python dna.py database.csv sequence.txt")
        sys.exit(1)

    # TODO: Read database file into a variable
    dna_database =[]
    with open(sys.argv[1], "r") as dna_data_file:
        reader = csv.DictReader(dna_data_file)
        for row in reader:
            dna_database.append(row)


    # TODO: Read DNA sequence file into a variable
    with open(sys.argv[2], "r") as load_sequence:
        sequence = load_sequence.read()

    # TODO: Find longest match of each STR in DNA sequence
    STR = list(dna_database[0].keys())[1:]
    STR_match ={}
    for i in range(len(dna_database)):
        # print(dna_database)
        STR_match[STR[i]] = longest_match(sequence,STR[i])

    # TODO: Check database for matching profiles
    for i in range(len(dna_database)):
        matches = 0
        for j in range(len(STR)):
            if int(STR_match[STR[j]]) == int(dna_database[i][STR[j]]):
                matches += 1
                if matches == len(STR):
                    print(dna_database[i]['name'])
                    sys.exit(0)

    print("No Match")

    return


def longest_match(sequence, subsequence):
    """Returns length of longest run of subsequence in sequence."""

    # Initialize variables
    longest_run = 0
    subsequence_length = len(subsequence)
    sequence_length = len(sequence)

    # Check each character in sequence for most consecutive runs of subsequence
    for i in range(sequence_length):

        # Initialize count of consecutive runs
        count = 0

        # Check for a subsequence match in a "substring" (a subset of characters) within sequence
        # If a match, move substring to next potential match in sequence
        # Continue moving substring and checking for matches until out of consecutive matches
        while True:

            # Adjust substring start and end
            start = i + count * subsequence_length
            end = start + subsequence_length

            # If there is a match in the substring
            if sequence[start:end] == subsequence:
                count += 1

            # If there is no match in the substring
            else:
                break

        # Update most consecutive matches found
        longest_run = max(longest_run, count)

    # After checking for runs at each character in seqeuence, return longest run found
    return longest_run


main()

分享到QQ

分享到微博