在python中从txt或json文件中提取非结构化数据

发布于 2025-01-19 16:23:52 字数 7205 浏览 0 评论 0原文

我有一个非结构化数据。它在我的服务器上,因此我将其以 txt 格式保存在我的计算机上以迭代和过滤数据。这就是我的代码的样子:

with open('dev.txt') as f:
    dev = f.read()

def parser_dev(dev):
    filtered_status = {}
    vpn_list = re.findall("vpn.*", dev, re.MULTILINE)
    status_list = re.findall("{\"devices.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*", dev, re.MULTILINE)
    ETA_productid = ['1','2','8','9','21','22']
    switch_productid=['34']
    for i, pattern in enumerate(status_list):
        json_object = json.loads(pattern)
        adapters = []
        switches = {}
        for key, value in json_object["devices"].items():
            if key.startswith("ETA"):
                #print('key---',key)
                if value["EbStatus"] != "0x80c0":
                    #print('value---',value)
                    if value["productid"] in ETA_productid:
                        adapters.append(key)
                    if value["productid"] in switch_productid:
                        switches[key] = value["canaddr"]

        if len(adapters) > 0 or len(switches) > 0:
            filtered_status[vpn_list[i]] = {
                'adapters':adapters,
                'switches':switches
            }
    return filtered_status

一旦我运行这个,我就会遇到一个错误:

Traceback (most recent call last):
  File "c:\Users\vaibhav.ghildiyal\Desktop\logParser.py", line 159, in <module> 
    filtered_vpns = parser_dev(dev)
  File "c:\Users\vaibhav.ghildiyal\Desktop\logParser.py", line 73, in parser_dev
    json_object = json.loads(pattern)
  File "C:\Python310\lib\json\__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "C:\Python310\lib\json\decoder.py", line 340, in decode
    raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 8 column 1 (char 174)

当我检查我的 txt 文件时,我意识到这个问题,该文件非常非结构化。这就是我的 txt 的样子:


vpn13_000000029
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 4

{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"1","vendorid":"969"
},"ETA1":
{"EbStatus":"0x80c0","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"124","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x80c0","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"154","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x80c0","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}

vpn13_000000005
vpn13_000000028
vpn13_000000015
solaredge.py = "0.14"
emsc2 = /opt/emsc2/emsc2.0.9.12
solardge.py procs = 4

{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
},"ETA1":
{"EbStatus":"0x4040","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"227","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x4040","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"147210004","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x2040","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}

vpn13_000000015
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 4

{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
},"ETA1":
{"EbStatus":"0x80c0","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"116","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x4040","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"163","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x4180","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}

vpn13_000000006
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 0

{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
},"ETA1":
{"EbStatus":"0x4040","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"170","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x4040","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"172","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x8080","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}

vpn13_000000027
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 4

{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
},"ETA1":
{"EbStatus":"0x80c0","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"217","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x80c0","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"218","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x80c0","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}

tannheim01
solaredge.py = "0.14"
emsc2 = /opt/emsc2/emsc2.0.9.12
solardge.py procs = 4

{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}

vpn13_000000014
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 4

{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"1","vendorid":"969"
},"ETA1":
{"EbStatus":"0x80c0","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"142","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x80c0","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"179","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x80c0","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}


很高兴听到一些有关如何从此数据结构中过滤掉重要数据和一些键值对的建议。示例:

ID 列表= vpn13_000000005,vpn13_000000028 ,vpn13_000000015,tannheim01

键值对 = EbStatus":"0x80c0"

键值对 =productid":"4"

列表ETA = ETA1、ETA2、ETA3

等等。谢谢你!

I have an unstructured data. Its on my server so, I'm saving it in txt format on my computer to iterate and filter the data. This is how my code looks like:

with open('dev.txt') as f:
    dev = f.read()

def parser_dev(dev):
    filtered_status = {}
    vpn_list = re.findall("vpn.*", dev, re.MULTILINE)
    status_list = re.findall("{\"devices.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*", dev, re.MULTILINE)
    ETA_productid = ['1','2','8','9','21','22']
    switch_productid=['34']
    for i, pattern in enumerate(status_list):
        json_object = json.loads(pattern)
        adapters = []
        switches = {}
        for key, value in json_object["devices"].items():
            if key.startswith("ETA"):
                #print('key---',key)
                if value["EbStatus"] != "0x80c0":
                    #print('value---',value)
                    if value["productid"] in ETA_productid:
                        adapters.append(key)
                    if value["productid"] in switch_productid:
                        switches[key] = value["canaddr"]

        if len(adapters) > 0 or len(switches) > 0:
            filtered_status[vpn_list[i]] = {
                'adapters':adapters,
                'switches':switches
            }
    return filtered_status

As soon as I run this, I have an error:

Traceback (most recent call last):
  File "c:\Users\vaibhav.ghildiyal\Desktop\logParser.py", line 159, in <module> 
    filtered_vpns = parser_dev(dev)
  File "c:\Users\vaibhav.ghildiyal\Desktop\logParser.py", line 73, in parser_dev
    json_object = json.loads(pattern)
  File "C:\Python310\lib\json\__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "C:\Python310\lib\json\decoder.py", line 340, in decode
    raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 8 column 1 (char 174)

I realized that this problem when I checked my txt file which was pretty unstructured. This is how my txt looks like:


vpn13_000000029
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 4

{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"1","vendorid":"969"
},"ETA1":
{"EbStatus":"0x80c0","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"124","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x80c0","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"154","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x80c0","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}

vpn13_000000005
vpn13_000000028
vpn13_000000015
solaredge.py = "0.14"
emsc2 = /opt/emsc2/emsc2.0.9.12
solardge.py procs = 4

{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
},"ETA1":
{"EbStatus":"0x4040","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"227","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x4040","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"147210004","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x2040","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}

vpn13_000000015
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 4

{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
},"ETA1":
{"EbStatus":"0x80c0","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"116","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x4040","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"163","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x4180","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}

vpn13_000000006
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 0

{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
},"ETA1":
{"EbStatus":"0x4040","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"170","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x4040","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"172","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x8080","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}

vpn13_000000027
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 4

{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
},"ETA1":
{"EbStatus":"0x80c0","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"217","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x80c0","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"218","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x80c0","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}

tannheim01
solaredge.py = "0.14"
emsc2 = /opt/emsc2/emsc2.0.9.12
solardge.py procs = 4

{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}

vpn13_000000014
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 4

{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"1","vendorid":"969"
},"ETA1":
{"EbStatus":"0x80c0","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"142","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x80c0","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"179","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x80c0","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}


Would be great to hear some suggestions on how to filter out important data and some key value pairs from this data structure. Example:

a list of ids= vpn13_000000005,vpn13_000000028 ,vpn13_000000015,tannheim01

key value pairs for = EbStatus":"0x80c0"

key value pairs for = productid":"4"

a list of ETAs = ETA1, ETA2, ETA3

so on and so forth. Thank you!

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。
列表为空,暂无数据
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文