# blob: 5064668c77ea74e30ae82572766724d59f79c109
"""A dumb and slow but simple dbm clone.

For database spam, spam.dir contains the index (a text file),
spam.bak *may* contain a backup of the index (also a text file),
while spam.dat contains the data (a binary file).

XXX TO DO:

- seems to contain a bug when updating...

- reclaim free space (currently, space once occupied by deleted or expanded
items is never reused)

- support concurrent access (currently, if two processes take turns making
updates, they can mess up the index)

- support efficient access to large databases (currently, the whole index
is read when the database is opened, and some updates rewrite the whole index)

- support opening for read-only (flag = 'm')

"""

import ast as _ast
import io as _io
import os as _os
import collections.abc

# Names exported by "from <module> import *".
__all__ = ["error", "open"]

# Values in the data file always start at a multiple of this many bytes.
_BLOCKSIZE = 512

# Exception type raised for database-level failures (e.g. use after close).
error = OSError

Serhiy Storchaka2e576f52017-04-24 09:05:00 +030035class _Database(collections.abc.MutableMapping):
Guido van Rossum9f824a71995-08-10 19:29:28 +000036
Tim Petersd7472ec2003-07-13 02:22:03 +000037 # The on-disk directory and data files can remain in mutually
38 # inconsistent states for an arbitrarily long time (see comments
39 # at the end of __setitem__). This is only repaired when _commit()
40 # gets called. One place _commit() gets called is from __del__(),
41 # and if that occurs at program shutdown time, module globals may
42 # already have gotten rebound to None. Since it's crucial that
Tim Peters03204642003-07-13 02:37:05 +000043 # _commit() finish successfully, we can't ignore shutdown races
Tim Petersd7472ec2003-07-13 02:22:03 +000044 # here, and _commit() must not reference any globals.
45 _os = _os # for _commit()
Guido van Rossum6252e102007-05-23 20:51:02 +000046 _io = _io # for _commit()
Tim Petersd7472ec2003-07-13 02:22:03 +000047
Serhiy Storchakab398d332014-06-10 21:16:00 +030048 def __init__(self, filebasename, mode, flag='c'):
Fred Drake2c8373b2001-12-07 21:54:46 +000049 self._mode = mode
Serhiy Storchaka0122ae92016-07-06 12:21:58 +030050 self._readonly = (flag == 'r')
Tim Peters7dfd5702003-07-12 20:11:25 +000051
52 # The directory file is a text file. Each line looks like
53 # "%r, (%d, %d)\n" % (key, pos, siz)
54 # where key is the string key, pos is the offset into the dat
55 # file of the associated value's first byte, and siz is the number
56 # of bytes in the associated value.
Skip Montanaro7a98be22007-08-16 14:35:24 +000057 self._dirfile = filebasename + '.dir'
Tim Peters7dfd5702003-07-12 20:11:25 +000058
59 # The data file is a binary file pointed into by the directory
60 # file, and holds the values associated with keys. Each value
61 # begins at a _BLOCKSIZE-aligned byte offset, and is a raw
62 # binary 8-bit string value.
Skip Montanaro7a98be22007-08-16 14:35:24 +000063 self._datfile = filebasename + '.dat'
64 self._bakfile = filebasename + '.bak'
Tim Peters7dfd5702003-07-12 20:11:25 +000065
66 # The index is an in-memory dict, mirroring the directory file.
67 self._index = None # maps keys to (pos, siz) pairs
68
Serhiy Storchakab398d332014-06-10 21:16:00 +030069 # Handle the creation
70 self._create(flag)
Serhiy Storchaka4fc79422016-12-07 11:11:12 +020071 self._update(flag)
Serhiy Storchakab398d332014-06-10 21:16:00 +030072
73 def _create(self, flag):
74 if flag == 'n':
75 for filename in (self._datfile, self._bakfile, self._dirfile):
76 try:
77 _os.remove(filename)
78 except OSError:
79 pass
Tim Peters88869f92001-01-14 23:36:06 +000080 # Mod by Jack: create data file if needed
81 try:
Brett Cannon2b5d6eb2008-11-24 21:09:58 +000082 f = _io.open(self._datfile, 'r', encoding="Latin-1")
Andrew Svetlovf7a17b42012-12-25 16:47:37 +020083 except OSError:
Serhiy Storchaka0122ae92016-07-06 12:21:58 +030084 if flag not in ('c', 'n'):
85 import warnings
86 warnings.warn("The database file is missing, the "
87 "semantics of the 'c' flag will be used.",
88 DeprecationWarning, stacklevel=4)
Serhiy Storchaka65c623d2014-06-25 20:35:31 +030089 with _io.open(self._datfile, 'w', encoding="Latin-1") as f:
90 self._chmod(self._datfile)
91 else:
92 f.close()
Guido van Rossum9f824a71995-08-10 19:29:28 +000093
Tim Peters7dfd5702003-07-12 20:11:25 +000094 # Read directory file into the in-memory index dict.
Serhiy Storchaka4fc79422016-12-07 11:11:12 +020095 def _update(self, flag):
Tim Peters88869f92001-01-14 23:36:06 +000096 self._index = {}
97 try:
Brett Cannon2b5d6eb2008-11-24 21:09:58 +000098 f = _io.open(self._dirfile, 'r', encoding="Latin-1")
Andrew Svetlovf7a17b42012-12-25 16:47:37 +020099 except OSError:
Serhiy Storchaka028ace12016-12-07 10:56:39 +0200100 self._modified = not self._readonly
Serhiy Storchaka4fc79422016-12-07 11:11:12 +0200101 if flag not in ('c', 'n'):
102 import warnings
103 warnings.warn("The index file is missing, the "
104 "semantics of the 'c' flag will be used.",
105 DeprecationWarning, stacklevel=4)
Tim Peters88869f92001-01-14 23:36:06 +0000106 else:
Serhiy Storchaka028ace12016-12-07 10:56:39 +0200107 self._modified = False
Serhiy Storchaka65c623d2014-06-25 20:35:31 +0300108 with f:
109 for line in f:
110 line = line.rstrip()
Serhiy Storchaka74eb8b22015-02-16 00:30:43 +0200111 key, pos_and_siz_pair = _ast.literal_eval(line)
Serhiy Storchaka65c623d2014-06-25 20:35:31 +0300112 key = key.encode('Latin-1')
113 self._index[key] = pos_and_siz_pair
Tim Peters88869f92001-01-14 23:36:06 +0000114
Tim Peters7dfd5702003-07-12 20:11:25 +0000115 # Write the index dict to the directory file. The original directory
116 # file (if any) is renamed with a .bak extension first. If a .bak
117 # file currently exists, it's deleted.
Tim Peters88869f92001-01-14 23:36:06 +0000118 def _commit(self):
Tim Petersd7472ec2003-07-13 02:22:03 +0000119 # CAUTION: It's vital that _commit() succeed, and _commit() can
120 # be called from __del__(). Therefore we must never reference a
121 # global in this routine.
Serhiy Storchaka028ace12016-12-07 10:56:39 +0200122 if self._index is None or not self._modified:
Tim Peters7a6c7332003-07-13 17:21:10 +0000123 return # nothing to do
124
Tim Peters7dfd5702003-07-12 20:11:25 +0000125 try:
Tim Petersd7472ec2003-07-13 02:22:03 +0000126 self._os.unlink(self._bakfile)
Andrew Svetlovad28c7f2012-12-18 22:02:39 +0200127 except OSError:
Tim Peters7dfd5702003-07-12 20:11:25 +0000128 pass
129
130 try:
Tim Petersd7472ec2003-07-13 02:22:03 +0000131 self._os.rename(self._dirfile, self._bakfile)
Andrew Svetlovad28c7f2012-12-18 22:02:39 +0200132 except OSError:
Tim Peters7dfd5702003-07-12 20:11:25 +0000133 pass
134
Serhiy Storchaka65c623d2014-06-25 20:35:31 +0300135 with self._io.open(self._dirfile, 'w', encoding="Latin-1") as f:
136 self._chmod(self._dirfile)
137 for key, pos_and_siz_pair in self._index.items():
138 # Use Latin-1 since it has no qualms with any value in any
139 # position; UTF-8, though, does care sometimes.
140 entry = "%r, %r\n" % (key.decode('Latin-1'), pos_and_siz_pair)
141 f.write(entry)
Tim Peters88869f92001-01-14 23:36:06 +0000142
Skip Montanaro6d068152003-07-14 12:15:15 +0000143 sync = _commit
144
Benjamin Petersone3083d32014-04-26 16:56:52 -0400145 def _verify_open(self):
146 if self._index is None:
147 raise error('DBM object has already been closed')
148
Tim Peters88869f92001-01-14 23:36:06 +0000149 def __getitem__(self, key):
Brett Cannon58425d32008-11-21 00:17:53 +0000150 if isinstance(key, str):
151 key = key.encode('utf-8')
Benjamin Petersone3083d32014-04-26 16:56:52 -0400152 self._verify_open()
Tim Peters88869f92001-01-14 23:36:06 +0000153 pos, siz = self._index[key] # may raise KeyError
Serhiy Storchaka65c623d2014-06-25 20:35:31 +0300154 with _io.open(self._datfile, 'rb') as f:
155 f.seek(pos)
156 dat = f.read(siz)
Tim Peters88869f92001-01-14 23:36:06 +0000157 return dat
158
Tim Peters7dfd5702003-07-12 20:11:25 +0000159 # Append val to the data file, starting at a _BLOCKSIZE-aligned
160 # offset. The data file is first padded with NUL bytes (if needed)
161 # to get to an aligned offset. Return pair
162 # (starting offset of val, len(val))
Tim Peters88869f92001-01-14 23:36:06 +0000163 def _addval(self, val):
Serhiy Storchaka65c623d2014-06-25 20:35:31 +0300164 with _io.open(self._datfile, 'rb+') as f:
165 f.seek(0, 2)
166 pos = int(f.tell())
167 npos = ((pos + _BLOCKSIZE - 1) // _BLOCKSIZE) * _BLOCKSIZE
168 f.write(b'\0'*(npos-pos))
169 pos = npos
170 f.write(val)
Tim Peters88869f92001-01-14 23:36:06 +0000171 return (pos, len(val))
172
Tim Peters7dfd5702003-07-12 20:11:25 +0000173 # Write val to the data file, starting at offset pos. The caller
174 # is responsible for ensuring that there's enough room starting at
175 # pos to hold val, without overwriting some other value. Return
176 # pair (pos, len(val)).
Tim Peters88869f92001-01-14 23:36:06 +0000177 def _setval(self, pos, val):
Serhiy Storchaka65c623d2014-06-25 20:35:31 +0300178 with _io.open(self._datfile, 'rb+') as f:
179 f.seek(pos)
180 f.write(val)
Tim Peters88869f92001-01-14 23:36:06 +0000181 return (pos, len(val))
182
Tim Peters7dfd5702003-07-12 20:11:25 +0000183 # key is a new key whose associated value starts in the data file
Tim Peters1d8d7292003-07-13 02:05:47 +0000184 # at offset pos and with length siz. Add an index record to
185 # the in-memory index dict, and append one to the directory file.
Tim Peters7dfd5702003-07-12 20:11:25 +0000186 def _addkey(self, key, pos_and_siz_pair):
187 self._index[key] = pos_and_siz_pair
Serhiy Storchaka65c623d2014-06-25 20:35:31 +0300188 with _io.open(self._dirfile, 'a', encoding="Latin-1") as f:
189 self._chmod(self._dirfile)
190 f.write("%r, %r\n" % (key.decode("Latin-1"), pos_and_siz_pair))
Tim Peters88869f92001-01-14 23:36:06 +0000191
192 def __setitem__(self, key, val):
Serhiy Storchaka0122ae92016-07-06 12:21:58 +0300193 if self._readonly:
194 import warnings
195 warnings.warn('The database is opened for reading only',
196 DeprecationWarning, stacklevel=2)
Brett Cannon58425d32008-11-21 00:17:53 +0000197 if isinstance(key, str):
198 key = key.encode('utf-8')
199 elif not isinstance(key, (bytes, bytearray)):
200 raise TypeError("keys must be bytes or strings")
Brett Cannon2b5d6eb2008-11-24 21:09:58 +0000201 if isinstance(val, str):
202 val = val.encode('utf-8')
203 elif not isinstance(val, (bytes, bytearray)):
204 raise TypeError("values must be bytes or strings")
Benjamin Petersone3083d32014-04-26 16:56:52 -0400205 self._verify_open()
Serhiy Storchaka028ace12016-12-07 10:56:39 +0200206 self._modified = True
Tim Peters7dfd5702003-07-12 20:11:25 +0000207 if key not in self._index:
208 self._addkey(key, self._addval(val))
Tim Peters88869f92001-01-14 23:36:06 +0000209 else:
Tim Peters7dfd5702003-07-12 20:11:25 +0000210 # See whether the new value is small enough to fit in the
211 # (padded) space currently occupied by the old value.
Tim Peters88869f92001-01-14 23:36:06 +0000212 pos, siz = self._index[key]
Tim Petersef6573e2003-07-11 04:09:55 +0000213 oldblocks = (siz + _BLOCKSIZE - 1) // _BLOCKSIZE
214 newblocks = (len(val) + _BLOCKSIZE - 1) // _BLOCKSIZE
Tim Peters88869f92001-01-14 23:36:06 +0000215 if newblocks <= oldblocks:
Tim Peters7dfd5702003-07-12 20:11:25 +0000216 self._index[key] = self._setval(pos, val)
Tim Peters88869f92001-01-14 23:36:06 +0000217 else:
Tim Peters7dfd5702003-07-12 20:11:25 +0000218 # The new value doesn't fit in the (padded) space used
219 # by the old value. The blocks used by the old value are
220 # forever lost.
221 self._index[key] = self._addval(val)
222
223 # Note that _index may be out of synch with the directory
224 # file now: _setval() and _addval() don't update the directory
Tim Peters1d8d7292003-07-13 02:05:47 +0000225 # file. This also means that the on-disk directory and data
226 # files are in a mutually inconsistent state, and they'll
227 # remain that way until _commit() is called. Note that this
228 # is a disaster (for the database) if the program crashes
229 # (so that _commit() never gets called).
Tim Peters88869f92001-01-14 23:36:06 +0000230
231 def __delitem__(self, key):
Serhiy Storchaka0122ae92016-07-06 12:21:58 +0300232 if self._readonly:
233 import warnings
234 warnings.warn('The database is opened for reading only',
235 DeprecationWarning, stacklevel=2)
Brett Cannon58425d32008-11-21 00:17:53 +0000236 if isinstance(key, str):
237 key = key.encode('utf-8')
Benjamin Petersone3083d32014-04-26 16:56:52 -0400238 self._verify_open()
Serhiy Storchaka028ace12016-12-07 10:56:39 +0200239 self._modified = True
Tim Peters7dfd5702003-07-12 20:11:25 +0000240 # The blocks used by the associated value are lost.
Tim Peters88869f92001-01-14 23:36:06 +0000241 del self._index[key]
Tim Peters7dfd5702003-07-12 20:11:25 +0000242 # XXX It's unclear why we do a _commit() here (the code always
Georg Brandld9e833c2010-12-04 09:14:36 +0000243 # XXX has, so I'm not changing it). __setitem__ doesn't try to
Tim Peters7dfd5702003-07-12 20:11:25 +0000244 # XXX keep the directory file in synch. Why should we? Or
245 # XXX why shouldn't __setitem__?
Tim Peters88869f92001-01-14 23:36:06 +0000246 self._commit()
247
248 def keys(self):
Serhiy Storchaka12c575f2014-05-28 18:49:52 +0300249 try:
250 return list(self._index)
251 except TypeError:
252 raise error('DBM object has already been closed') from None
Guido van Rossum7b4beea2007-08-28 00:09:54 +0000253
254 def items(self):
Benjamin Petersone3083d32014-04-26 16:56:52 -0400255 self._verify_open()
Brett Cannon58425d32008-11-21 00:17:53 +0000256 return [(key, self[key]) for key in self._index.keys()]
Tim Peters88869f92001-01-14 23:36:06 +0000257
Fred Drakea7cc69e2001-05-03 04:55:47 +0000258 def __contains__(self, key):
Brett Cannon58425d32008-11-21 00:17:53 +0000259 if isinstance(key, str):
260 key = key.encode('utf-8')
Serhiy Storchaka12c575f2014-05-28 18:49:52 +0300261 try:
262 return key in self._index
263 except TypeError:
264 if self._index is None:
265 raise error('DBM object has already been closed') from None
266 else:
267 raise
Fred Drakea7cc69e2001-05-03 04:55:47 +0000268
269 def iterkeys(self):
Serhiy Storchaka12c575f2014-05-28 18:49:52 +0300270 try:
271 return iter(self._index)
272 except TypeError:
273 raise error('DBM object has already been closed') from None
Fred Drakea7cc69e2001-05-03 04:55:47 +0000274 __iter__ = iterkeys
275
Tim Peters88869f92001-01-14 23:36:06 +0000276 def __len__(self):
Serhiy Storchaka12c575f2014-05-28 18:49:52 +0300277 try:
278 return len(self._index)
279 except TypeError:
280 raise error('DBM object has already been closed') from None
Tim Peters88869f92001-01-14 23:36:06 +0000281
282 def close(self):
Serhiy Storchaka7e7a3db2015-04-10 13:24:41 +0300283 try:
284 self._commit()
285 finally:
286 self._index = self._datfile = self._dirfile = self._bakfile = None
Guido van Rossum9f824a71995-08-10 19:29:28 +0000287
Tim Peters7a6c7332003-07-13 17:21:10 +0000288 __del__ = close
Tim Peterse4418602002-02-16 07:34:19 +0000289
Georg Brandld9e833c2010-12-04 09:14:36 +0000290 def _chmod(self, file):
Thomas Wouters902d6eb2007-01-09 23:18:33 +0000291 if hasattr(self._os, 'chmod'):
292 self._os.chmod(file, self._mode)
Anthony Baxtered905702001-12-21 05:13:37 +0000293
Nick Coghlanc610aba2013-11-17 15:59:51 +1000294 def __enter__(self):
295 return self
296
297 def __exit__(self, *args):
298 self.close()
299
Guido van Rossum9f824a71995-08-10 19:29:28 +0000300
def open(file, flag='c', mode=0o666):
    """Open the database named by file and return the corresponding object.

    file is the base name of the database; the files actually used are
    file + '.dir', file + '.dat' and file + '.bak'.

    The flag argument, used to control how the database is opened in the
    other DBM implementations, supports only the semantics of 'c' and 'n'
    values.  Other values will default to the semantics of 'c' value:
    the database will always be opened for update and will be created if
    it does not exist.  A flag other than 'r', 'w', 'c' or 'n' triggers a
    DeprecationWarning.

    The optional mode argument is the UNIX mode of the file, used only when
    the database has to be created.  It defaults to octal code 0o666 (and
    will be modified by the prevailing umask).

    """

    # Modify mode depending on the umask
    try:
        # os.umask() can only be read by setting it, so set a dummy value
        # and restore the previous one straight away.
        um = _os.umask(0)
        _os.umask(um)
    except AttributeError:
        pass
    else:
        # Turn off any bits that are set in the umask
        mode = mode & (~um)
    if flag not in ('r', 'w', 'c', 'n'):
        import warnings
        warnings.warn("Flag must be one of 'r', 'w', 'c', or 'n'",
                      DeprecationWarning, stacklevel=2)
    return _Database(file, mode, flag=flag)
Serhiy Storchakab398d332014-06-10 21:16:00 +0300329 return _Database(file, mode, flag=flag)