Python shelf file grows when trying to overwrite data
When I add columns to DataFrames stored in a shelf object, the shelf file grows far faster than the data itself. Each time I add a new column and write an entry back, the file grows by roughly the size of all the stored entries, as though a complete copy of every entry were appended with the added column applied. The number of keys does not change, and the new columns are present when I access the modified DataFrames. The size of the DataFrames in bytes increases, but not in proportion to the file size.
import os, sys
import shelve
import numpy as np
import pandas as pd

class shelf_class():
    def __init__(self, dbfile):
        # A database of DataFrames
        self._db = shelve.open(dbfile)
    def __getitem__(self, key):
        return self._db[key]
    def __setitem__(self, key, value):
        self._db[key] = value
    def __delitem__(self, key):
        del self._db[key]
    def add_sum(self, key):
        # Read the entry, append a 'sum' column, write it back
        new_data = self._db[key]
        new_data['sum'] = self._db[key].to_numpy().sum(axis=1)
        self._db[key] = new_data
    def add_mean(self, key):
        # Read the entry, append a 'mean' column, write it back
        new_data = self._db[key]
        new_data['mean'] = self._db[key].to_numpy().mean(axis=1)
        self._db[key] = new_data

filename = 'my_file'
store = shelf_class(filename)
keys = [str(x) for x in range(10)]

for i in keys:
    store[i] = pd.DataFrame(np.random.random((100, 10)))
s = os.path.getsize(filename + '.dat')
print(f'file size = {s:d}')
s = sys.getsizeof(store[i])
print(f'dataframe size = {s:d}')

for i in keys:
    store.add_sum(i)
s = os.path.getsize(filename + '.dat')
print(f'file size = {s:d}')
s = sys.getsizeof(store[i])
print(f'dataframe size = {s:d}')

for i in keys:
    store.add_mean(i)
s = os.path.getsize(filename + '.dat')
print(f'file size = {s:d}')
s = sys.getsizeof(store[i])
print(f'dataframe size = {s:d}')
This produces:
file size = 86917
dataframe size = 8144
file size = 179152
dataframe size = 8544
file size = 276416
dataframe size = 8944
What am I doing wrong?
1 Answer
This is a known problem with shelve: when an entry is overwritten, the underlying dbm backend typically does not reclaim the space used by the old value; the new pickle is written to fresh space, so every rewrite pass grows the file by roughly the size of the stored data. This post
Shelve dictionary size is >100Gb for a 2Gb text file
provides a method to clean up and defrag the database file. A minimal sketch, assuming the shelf has been closed first and filename refers to the database created above:
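import dbm.gnu

# Reopen the (closed) shelf file directly with dbm.gnu in write mode
# and rebuild it, reclaiming the space held by old, overwritten records.
db = dbm.gnu.open(filename, 'w')
db.reorganize()
db.close()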
This requires the underlying database to be dbm.gnu, which can be checked with dbm.whichdb():
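import dbm

# Reports which backend created the file, e.g. 'dbm.gnu', 'dbm.ndbm'
# or 'dbm.dumb'; reorganize() is only available for dbm.gnu.
print(dbm.whichdb(filename))

Note that the .dat suffix seen in the question is characteristic of the pure-Python dbm.dumb backend, which does not offer reorganize().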