在python中从txt或json文件中提取非结构化数据
我有一个非结构化数据。它在我的服务器上,因此我将其以 txt 格式保存在我的计算机上以迭代和过滤数据。这就是我的代码的样子:
with open('dev.txt') as f:
dev = f.read()
def parser_dev(dev):
filtered_status = {}
vpn_list = re.findall("vpn.*", dev, re.MULTILINE)
status_list = re.findall("{\"devices.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*", dev, re.MULTILINE)
ETA_productid = ['1','2','8','9','21','22']
switch_productid=['34']
for i, pattern in enumerate(status_list):
json_object = json.loads(pattern)
adapters = []
switches = {}
for key, value in json_object["devices"].items():
if key.startswith("ETA"):
#print('key---',key)
if value["EbStatus"] != "0x80c0":
#print('value---',value)
if value["productid"] in ETA_productid:
adapters.append(key)
if value["productid"] in switch_productid:
switches[key] = value["canaddr"]
if len(adapters) > 0 or len(switches) > 0:
filtered_status[vpn_list[i]] = {
'adapters':adapters,
'switches':switches
}
return filtered_status
一旦我运行这个,我就会遇到一个错误:
Traceback (most recent call last):
File "c:\Users\vaibhav.ghildiyal\Desktop\logParser.py", line 159, in <module>
filtered_vpns = parser_dev(dev)
File "c:\Users\vaibhav.ghildiyal\Desktop\logParser.py", line 73, in parser_dev
json_object = json.loads(pattern)
File "C:\Python310\lib\json\__init__.py", line 346, in loads
return _default_decoder.decode(s)
File "C:\Python310\lib\json\decoder.py", line 340, in decode
raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 8 column 1 (char 174)
当我检查我的 txt 文件时,我意识到这个问题,该文件非常非结构化。这就是我的 txt 的样子:
vpn13_000000029
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 4
{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"1","vendorid":"969"
},"ETA1":
{"EbStatus":"0x80c0","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"124","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x80c0","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"154","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x80c0","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}
vpn13_000000005
vpn13_000000028
vpn13_000000015
solaredge.py = "0.14"
emsc2 = /opt/emsc2/emsc2.0.9.12
solardge.py procs = 4
{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
},"ETA1":
{"EbStatus":"0x4040","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"227","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x4040","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"147210004","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x2040","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}
vpn13_000000015
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 4
{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
},"ETA1":
{"EbStatus":"0x80c0","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"116","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x4040","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"163","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x4180","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}
vpn13_000000006
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 0
{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
},"ETA1":
{"EbStatus":"0x4040","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"170","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x4040","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"172","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x8080","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}
vpn13_000000027
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 4
{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
},"ETA1":
{"EbStatus":"0x80c0","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"217","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x80c0","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"218","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x80c0","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}
tannheim01
solaredge.py = "0.14"
emsc2 = /opt/emsc2/emsc2.0.9.12
solardge.py procs = 4
{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}
vpn13_000000014
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 4
{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"1","vendorid":"969"
},"ETA1":
{"EbStatus":"0x80c0","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"142","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x80c0","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"179","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x80c0","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}
很高兴听到一些有关如何从此数据结构中过滤掉重要数据和一些键值对的建议。示例:
ID 列表= vpn13_000000005,vpn13_000000028 ,vpn13_000000015,tannheim01
键值对 = EbStatus":"0x80c0"
键值对 =productid":"4"
列表ETA = ETA1、ETA2、ETA3
等等。谢谢你!
I have an unstructured data. Its on my server so, I'm saving it in txt format on my computer to iterate and filter the data. This is how my code looks like:
with open('dev.txt') as f:
dev = f.read()
def parser_dev(dev):
filtered_status = {}
vpn_list = re.findall("vpn.*", dev, re.MULTILINE)
status_list = re.findall("{\"devices.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*\n.*", dev, re.MULTILINE)
ETA_productid = ['1','2','8','9','21','22']
switch_productid=['34']
for i, pattern in enumerate(status_list):
json_object = json.loads(pattern)
adapters = []
switches = {}
for key, value in json_object["devices"].items():
if key.startswith("ETA"):
#print('key---',key)
if value["EbStatus"] != "0x80c0":
#print('value---',value)
if value["productid"] in ETA_productid:
adapters.append(key)
if value["productid"] in switch_productid:
switches[key] = value["canaddr"]
if len(adapters) > 0 or len(switches) > 0:
filtered_status[vpn_list[i]] = {
'adapters':adapters,
'switches':switches
}
return filtered_status
As soon as I run this, I have an error:
Traceback (most recent call last):
File "c:\Users\vaibhav.ghildiyal\Desktop\logParser.py", line 159, in <module>
filtered_vpns = parser_dev(dev)
File "c:\Users\vaibhav.ghildiyal\Desktop\logParser.py", line 73, in parser_dev
json_object = json.loads(pattern)
File "C:\Python310\lib\json\__init__.py", line 346, in loads
return _default_decoder.decode(s)
File "C:\Python310\lib\json\decoder.py", line 340, in decode
raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 8 column 1 (char 174)
I realized that this problem when I checked my txt file which was pretty unstructured. This is how my txt looks like:
vpn13_000000029
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 4
{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"1","vendorid":"969"
},"ETA1":
{"EbStatus":"0x80c0","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"124","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x80c0","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"154","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x80c0","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}
vpn13_000000005
vpn13_000000028
vpn13_000000015
solaredge.py = "0.14"
emsc2 = /opt/emsc2/emsc2.0.9.12
solardge.py procs = 4
{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
},"ETA1":
{"EbStatus":"0x4040","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"227","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x4040","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"147210004","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x2040","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}
vpn13_000000015
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 4
{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
},"ETA1":
{"EbStatus":"0x80c0","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"116","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x4040","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"163","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x4180","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}
vpn13_000000006
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 0
{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
},"ETA1":
{"EbStatus":"0x4040","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"170","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x4040","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"172","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x8080","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}
vpn13_000000027
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 4
{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
},"ETA1":
{"EbStatus":"0x80c0","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"217","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x80c0","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"218","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x80c0","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}
tannheim01
solaredge.py = "0.14"
emsc2 = /opt/emsc2/emsc2.0.9.12
solardge.py procs = 4
{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}
vpn13_000000014
solaredge.py = "0.16"
emsc2 = /opt/emsc2/emsc2.0.9.13
solardge.py procs = 4
{"devices":
{"EMSC2":
{"canaddr":"0.1","deviceType":"EMSC2","ipaddr":"0.0.0.0","productid":"4","revision":"1","serial":"0","status":"1","vendorid":"969"
},"ETA1":
{"EbStatus":"0x80c0","canaddr":"0.2","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"142","status":"0","vendorid":"969"
},"ETA2":
{"EbStatus":"0x80c0","canaddr":"0.3","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"9","revision":"3791667201","serial":"179","status":"0","vendorid":"969"
},"ETA3":
{"EbStatus":"0x80c0","canaddr":"0.4","deviceType":"ETA","ipaddr":"0.0.0.0","productid":"34","revision":"2684362766","serial":"0","status":"0","vendorid":"969"
}
},"errorcode":0
}
Would be great to hear some suggestions on how to filter out important data and some key value pairs from this data structure. Example:
a list of ids= vpn13_000000005,vpn13_000000028 ,vpn13_000000015,tannheim01
key value pairs for = EbStatus":"0x80c0"
key value pairs for = productid":"4"
a list of ETAs = ETA1, ETA2, ETA3
so on and so forth. Thank you!
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论