How to get data from an STDF file into a pandas DataFrame in Python

I have data coming in as STDF files, the output file format of the test machines used in the semiconductor manufacturing industry.
I need to read these files in Python and analyze machine downtime and the other details recorded in them.
I searched GitHub and other platforms for a solution, but the Python modules available are either not bug-free or not documented well enough to build on.

2 Answers

一身仙ぐ女味 2025-02-20 13:50:44

I suggest pystdf.

From my experience, that library is bug-free, although its performance is somewhat slow on big files. And you'll still have to understand and sort through all the record types for data-analysis purposes.

Sample usage below (this snippet reads multiple STDF files into a pandas DataFrame per record type).

import os
import pandas as pd
from io import StringIO
import pystdf.V4 as v4
from pystdf.IO import Parser
from pystdf.Writers import TextWriter


def stdf_to_dfs(filelist):
    ''' Takes a list of stdf files, and returns individual dataframes for each record type, separated per file.
    Also, prepends the line number from the atdf (as well as the source file).'''

    record_dfs = {}
    for file in filelist:
        filename = os.path.basename(file)
        # parse the STDF and capture the ATDF text representation;
        # the with-block ensures the file handle gets closed
        captured_std_out = StringIO()
        with open(file, 'rb') as f:
            p = Parser(inp=f)
            p.addSink(TextWriter(captured_std_out))
            p.parse()
        atdf = captured_std_out.getvalue()

        # prepend line number and source file name to captured_std_out so it can be sorted later
        # line number is 2nd field... 1st field is record_type
        atdf = atdf.split('\n')
        for n, l in enumerate(atdf):
            atdf[n] = l[:4] + str(n) + '|' + filename + '|' + l[4:]

        # read each record type into a separate dataframe
        for record_type in v4.records:
            record_name = record_type.name.split('.')[-1].upper()
            curr = [line for line in atdf if line.startswith(record_name)]
            curr = '\n'.join(curr)
            if curr:  # skip record types that do not appear in this file
                header_names = ['Record', 'LineNum', 'SourceFile'] + list(list(zip(*record_type.fieldMap))[0])
                if record_name not in record_dfs:
                    record_dfs[record_name] = pd.DataFrame()
                record_dfs[record_name] = pd.concat([record_dfs[record_name], pd.read_csv(
                    StringIO(curr), header=None, names=header_names, delimiter='|')])

    # drop any record dataframes that ended up empty
    record_dfs = {k: v for k, v in record_dfs.items() if not v.empty}

    return record_dfs
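
For illustration, a hypothetical call might look like the sketch below. The file names are placeholders, and the snippet assumes the parsed files contain PTR (parametric test result) records; TEST_NUM and RESULT are field names from pystdf's V4 field map.

files = ['lot1.stdf', 'lot2.stdf']  # placeholder file names
dfs = stdf_to_dfs(files)

# PTR records are a common starting point for test-data analysis;
# the SourceFile column was prepended by stdf_to_dfs above
if 'PTR' in dfs:
    ptr = dfs['PTR']
    print(ptr[['SourceFile', 'TEST_NUM', 'RESULT']].head())
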
一紙繁鸢 2025-02-20 13:50:44

I wrote a commercial module, STDF QuickChange, that transforms STDF into more usable formats such as CSV. The primary output format has one row per unit and one column per test. It isn't Python, but you can execute it from Python and then load the CSV with pandas. If you are loading datalog data and need the limits as well, there are options to store the limits in the first rows.
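
A minimal sketch of that workflow from Python, assuming the converter is exposed as a command-line tool (the executable name, flags, and file names below are placeholders, not the tool's documented interface):

import subprocess
import pandas as pd

# hypothetical command line -- consult the STDF QuickChange
# documentation for the actual invocation
subprocess.run(['stdf-quickchange', 'lot1.stdf', '-o', 'lot1.csv'], check=True)

# one row per unit, one column per test
df = pd.read_csv('lot1.csv')
print(df.head())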
