diff --git a/.travis.yml b/.travis.yml index 464fafb..58aa658 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,8 +8,12 @@ python: - 3.3 - 3.4 - 3.5 +env: + # test with paths containing the unicode characters + - TMPDIR="/tmp/καλημέρα" install: - pip install coveralls + - ln -s /tmp "$TMPDIR" script: - ulimit -n 48 - ulimit -n diff --git a/doc/source/changes.rst b/doc/source/changes.rst index f99e85f..e279b63 100644 --- a/doc/source/changes.rst +++ b/doc/source/changes.rst @@ -2,34 +2,52 @@ Changelog ######### -********** +2.1.0 +====== + +- **BREAKING API:** retrofit ``git.util.mman`` as context-manager, + to release memory-mapped regions held. + + The *mmap-manager(s)* are re-entrant, but not thread-safe **context-manager(s)**, + to be used within a ``with ...:`` block, ensuring any left-overs cursors are cleaned up. + If not entered, :meth:`StaticWindowMapManager.make_cursor()` and/or + :meth:`WindowCursor.use_region()` will scream. + + Get them from ``smmap.managed_mmaps()``. + +- FIX ``memoryview`` leak in Windows; now all *gitdb* TCs now pass without explicit release! + +- Simplify :class:`SlidingWindowMapBuffer` as create/close context-manager + (no ``begin_access()``, or ``end_access()``). + + v0.9.0 -********** +======== - Fixed issue with resources never being freed as mmaps were never closed. - Client counting is now done manually, instead of relying on pyton's reference count -********** + v0.8.5 -********** +======== - Fixed Python 3.0-3.3 regression, which also causes smmap to become about 3 times slower depending on the code path. 
It's related to this bug (http://bugs.python.org/issue15958), which was fixed in python 3.4 -********** + v0.8.4 -********** +======== - Fixed Python 3 performance regression -********** + v0.8.3 -********** +======== - Cleaned up code and assured it works sufficiently well with python 3 -********** + v0.8.1 -********** +======== - A single bugfix -********** -v0.8.0 -********** + +v0.8.0 +======== - Initial Release diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst index 917b245..8ba8d7c 100644 --- a/doc/source/tutorial.rst +++ b/doc/source/tutorial.rst @@ -5,114 +5,138 @@ Usage Guide ########### This text briefly introduces you to the basic design decisions and accompanying classes. -****** Design -****** -Per application, there is *MemoryManager* which is held as static instance and used throughout the application. It can be configured to keep your resources within certain limits. +====== +Per application, there must be a *MemoryManager* to be used throughout the application. +It can be configured to keep your resources within certain limits (see :func:`smmap.managed_mmaps()`). -To access mapped regions, you require a cursor. Cursors point to exactly one file and serve as handles into it. As long as it exists, the respective memory region will remain available. +To access mapped regions, you require a cursor. Cursors point to exactly one file +and serve as handles into it. +As long as it exists, the respective memory region will remain available. + +For convenience, a buffer implementation is provided (:class:`smmap.SlidingWindowMapBuffer`) +which handles cursors and resource allocation behind its simple buffer like interface. -For convenience, a buffer implementation is provided which handles cursors and resource allocation behind its simple buffer like interface. -*************** Memory Managers -*************** -There are two types of memory managers, one uses *static* windows, the other one uses *sliding* windows. 
A window is a region of a file mapped into memory. Although the names might be somewhat misleading as technically windows are always static, the *sliding* version will allocate relatively small windows whereas the *static* version will always map the whole file. +================ +There are two types of memory managers, one uses *static* windows, the other one uses *sliding* windows. +A window is a region of a file mapped into memory. Although the names might be somewhat misleading, +as technically windows are always static, the *sliding* version will allocate relatively small windows +whereas the *static* version will always map the whole file. + +The *static* memory-manager does nothing more than keeping a client count on the respective memory maps +which always map the whole file, which allows making some assumptions that can lead to simplified +data access and increased performance, but reduces compatibility with 32 bit systems or giant files. + +The *sliding* memory-manager therefore should be the default manager when preparing an application +for handling huge amounts of data on 32 bit and 64 bit platforms. -The *static* manager does nothing more than keeping a client count on the respective memory maps which always map the whole file, which allows to make some assumptions that can lead to simplified data access and increased performance, but reduces the compatibility to 32 bit systems or giant files. +.. Note:: + The *mmap-manager(s)* are re-entrant, but not thread-safe **context-manager(s)**, + to be used within a ``with ...:`` block, ensuring any left-over cursors are cleaned up. + If not entered, :meth:`StaticWindowMapManager.make_cursor()` and/or + :meth:`WindowCursor.use_region()` will raise a ``ValueError``. 
-The *sliding* memory manager therefore should be the default manager when preparing an application for handling huge amounts of data on 32 bit and 64 bit platforms:: + +Use the :func:`smmap.managed_mmaps()` to take care of all this:: import smmap # This instance should be globally available in your application # It is configured to be well suitable for 32-bit or 64 bit applications. - mman = smmap.SlidingWindowMapManager() - - # the manager provides much useful information about its current state - # like the amount of open file handles or the amount of mapped memory - mman.num_file_handles() - mman.mapped_memory_size() - # and many more ... + with smmap.managed_mmaps() as mman: + + # the manager provides much useful information about its current state + # like the amount of open file handles or the amount of mapped memory + mman.num_file_handles() + mman.mapped_memory_size() + # and many more ... Cursors -******* +======== *Cursors* are handles that point onto a window, i.e. a region of a file mapped into memory. From them you may obtain a buffer through which the data of that window can actually be accessed:: - import smmap.test.lib - fc = smmap.test.lib.FileCreator(1024*1024*8, "test_file") - - # obtain a cursor to access some file. - c = mman.make_cursor(fc.path) - - # the cursor is now associated with the file, but not yet usable - assert c.is_associated() - assert not c.is_valid() - - # before you can use the cursor, you have to specify a window you want to - # access. The following just says you want as much data as possible starting - # from offset 0. 
- # To be sure your region could be mapped, query for validity - assert c.use_region().is_valid() # use_region returns self - - # once a region was mapped, you must query its dimension regularly - # to assure you don't try to access its buffer out of its bounds - assert c.size() - c.buffer()[0] # first byte - c.buffer()[1:10] # first 9 bytes - c.buffer()[c.size()-1] # last byte - - # its recommended not to create big slices when feeding the buffer - # into consumers (e.g. struct or zlib). - # Instead, either give the buffer directly, or use pythons buffer command. - buffer(c.buffer(), 1, 9) # first 9 bytes without copying them - - # you can query absolute offsets, and check whether an offset is included - # in the cursor's data. - assert c.ofs_begin() < c.ofs_end() - assert c.includes_ofs(100) - - # If you are over out of bounds with one of your region requests, the - # cursor will be come invalid. It cannot be used in that state - assert not c.use_region(fc.size, 100).is_valid() - # map as much as possible after skipping the first 100 bytes - assert c.use_region(100).is_valid() - - # You can explicitly free cursor resources by unusing the cursor's region - c.unuse_region() - assert not c.is_valid() - - -Now you would have to write your algorithms around this interface to properly slide through huge amounts of data. - + import smmap.test.lib as tlib + + with smmap.managed_mmaps() as mman, tlib.FileCreator(1024*1024*8, "test_file") as fc: + # obtain a cursor to access some file. + with mman.make_cursor(fc.path) as c: + + # the cursor is now associated with the file, but not yet usable + assert c.is_associated() + assert not c.is_valid() + + # before you can use the cursor, you have to specify a window you want to + # access. The following just says you want as much data as possible starting + # from offset 0. 
+ # To be sure your region could be mapped, query for validity + assert c.use_region().is_valid() # use_region returns self + + # once a region was mapped, you must query its dimension regularly + # to assure you don't try to access its buffer out of its bounds + assert c.size() + c.buffer()[0] # first byte + c.buffer()[1:10] # first 9 bytes + c.buffer()[c.size()-1] # last byte + + # its recommended not to create big slices when feeding the buffer + # into consumers (e.g. struct or zlib). + # Instead, either give the buffer directly, or use pythons buffer command. + buffer(c.buffer(), 1, 9) # first 9 bytes without copying them + + # you can query absolute offsets, and check whether an offset is included + # in the cursor's data. + assert c.ofs_begin() < c.ofs_end() + assert c.includes_ofs(100) + + # If you are over out of bounds with one of your region requests, the + # cursor will be come invalid. It cannot be used in that state + assert not c.use_region(fc.size, 100).is_valid() + # map as much as possible after skipping the first 100 bytes + assert c.use_region(100).is_valid() + + # You must explicitly free cursor resources by unusing the cursor's region + c.unuse_region() + assert not c.is_valid() + + +Now you would have to write your algorithms around this interface to properly slide through huge amounts of data. + Alternatively you can use a convenience interface. -******* + +======== Buffers -******* -To make first use easier, at the expense of performance, there is a Buffer implementation which uses a cursor underneath. 
- -With it, you can access all data in a possibly huge file without having to take care of setting the cursor to different regions yourself:: - - # Create a default buffer which can operate on the whole file - buf = smmap.SlidingWindowMapBuffer(mman.make_cursor(fc.path)) - - # you can use it right away - assert buf.cursor().is_valid() - - buf[0] # access the first byte - buf[-1] # access the last ten bytes on the file - buf[-10:]# access the last ten bytes - - # If you want to keep the instance between different accesses, use the - # dedicated methods - buf.end_access() - assert not buf.cursor().is_valid() # you cannot use the buffer anymore - assert buf.begin_access(offset=10) # start using the buffer at an offset - - # it will stop using resources automatically once it goes out of scope - +======== +To make first use easier, at the expense of performance, there is a Buffer implementation +which uses a cursor underneath. + +With it, you can access all data in a possibly huge file +without having to take care of setting the cursor to different regions yourself:: + + ## Create a default buffer which can operate on the whole file + cur = mman.make_cursor(fc.path) + with smmap.SlidingWindowMapBuffer(cur) as buf: + # you can use it right away + assert buf.cursor().is_valid() + + buf[0] # access the first byte + buf[-1] # access the last byte of the file + buf[-10:]# access the last ten bytes + + ## You cannot use the buffer anymore; its cursor region has been released. + assert not cur.is_valid() + + ## If you want to keep the instance between different accesses, + # use another instance. + with smmap.SlidingWindowMapBuffer(cur, offset=10) as buf: + assert buf.cursor().is_valid() + + Disadvantages -************* -Buffers cannot be used in place of strings or maps, hence you have to slice them to have valid input for the sorts of struct and zlib. A slice means a lot of data handling overhead which makes buffers slower compared to using cursors directly. 
+-------------- +Buffers cannot be used in place of strings or maps, hence you have to slice them to have valid +input for the sorts of struct and zlib. +A slice means a lot of data handling overhead which makes buffers slower compared to using cursors directly. diff --git a/smmap/__init__.py b/smmap/__init__.py index 9cfd0a1..9f3e8eb 100644 --- a/smmap/__init__.py +++ b/smmap/__init__.py @@ -3,9 +3,9 @@ __author__ = "Sebastian Thiel" __contact__ = "byronimo@gmail.com" __homepage__ = "https://github.com/Byron/smmap" -version_info = (2, 0, 1) +version_info = (2, 1, 0, 'dev4') __version__ = '.'.join(str(i) for i in version_info) # make everything available in root package for convenience -from .mman import * -from .buf import * +from .mman import * # @IgnorePep8 +from .buf import * # @IgnorePep8 diff --git a/smmap/buf.py b/smmap/buf.py index 438292b..7e48ced 100644 --- a/smmap/buf.py +++ b/smmap/buf.py @@ -1,14 +1,17 @@ """Module with a simple buffer implementation using the memory manager""" import sys +import logging __all__ = ["SlidingWindowMapBuffer"] -import sys try: bytes except NameError: - bytes = str + bytes = str # @ReservedAssignment + + +log = logging.getLogger(__name__) class SlidingWindowMapBuffer(object): @@ -19,17 +22,23 @@ class SlidingWindowMapBuffer(object): The buffer is relative, that is if you map an offset, index 0 will map to the first byte at the offset you used during initialization or begin_access - **Note:** Although this type effectively hides the fact that there are mapped windows - underneath, it can unfortunately not be used in any non-pure python method which - needs a buffer or string""" + .. Tip:: + Use it as a context-manager inside a ``with SlidingWindowMapBuffer(...):`` block. + + .. 
Note:: + Although this type effectively hides the fact that there are mapped windows + underneath, it can unfortunately not be used in any non-pure python method which + needs a buffer or string + """ __slots__ = ( '_c', # our cursor '_size', # our supposed size + '_entered', # entry/exit accounting ) def __init__(self, cursor=None, offset=0, size=sys.maxsize, flags=0): - """Initalize the instance to operate on the given cursor. - :param cursor: if not None, the associated cursor to the file you want to access + """Initialize the instance to operate on the given cursor. + :param cursor: The associated cursor to the file you want to access If None, you have call begin_access before using the buffer and provide a cursor :param offset: absolute offset in bytes :param size: the total size of the mapping. Defaults to the maximum possible size @@ -39,24 +48,49 @@ def __init__(self, cursor=None, offset=0, size=sys.maxsize, flags=0): Hence it is in your own interest to provide a proper size ! :param flags: Additional flags to be passed to os.open :raise ValueError: if the buffer could not achieve a valid state""" + if not cursor: + raise ValueError("Cursor cannot be null!") self._c = cursor - if cursor and not self.begin_access(cursor, offset, size, flags): - raise ValueError("Failed to allocate the buffer - probably the given offset is out of bounds") - # END handle offset - - def __del__(self): - self.end_access() + self._entered = 0 + + if cursor.is_associated() and cursor.use_region(offset, size, flags).is_valid(): + # if given size is too large or default, we computer a proper size + # If its smaller, we assume the combination between offset and size + # as chosen by the user is correct and use it ! + # If not, the user is in trouble. + if size > cursor.file_size(): + size = cursor.file_size() - offset + # END handle size + self._size = size + else: + raise ValueError("Cursor %s not associated or mapping region failed!" 
% cursor) def __enter__(self): + assert self._entered >= 0, self._entered + self._entered += 1 return self def __exit__(self, exc_type, exc_value, traceback): - self.end_access() + assert self._entered >= 0, self._entered + self._entered -= 1 + if self._entered == 0: + self.close() + + def __del__(self): + if self._entered != 0: + log.warning("Missed %s exit(s) on %s!" % (self._entered, self)) + self.close() + + def _check_if_entered(self): + if self._entered <= 0: + raise ValueError('Context-manager %s not entered!' % self) def __len__(self): return self._size def __getitem__(self, i): + self._check_if_entered() + if isinstance(i, slice): return self.__getslice__(i.start or 0, i.stop or self._size) c = self._c @@ -69,6 +103,8 @@ def __getitem__(self, i): return c.buffer()[i - c.ofs_begin()] def __getslice__(self, i, j): + self._check_if_entered() + c = self._c # fast path, slice fully included - safes a concatenate operation and # should be the default @@ -103,7 +139,7 @@ def __getslice__(self, i, j): # END while there are bytes to read return out else: - md = list() + md = [] while l: c.use_region(ofs, l) assert c.is_valid() @@ -120,44 +156,16 @@ def __getslice__(self, i, j): # END fast or slow path #{ Interface - def begin_access(self, cursor=None, offset=0, size=sys.maxsize, flags=0): - """Call this before the first use of this instance. The method was already - called by the constructor in case sufficient information was provided. - - For more information no the parameters, see the __init__ method - :param path: if cursor is None the existing one will be used. 
- :return: True if the buffer can be used""" - if cursor: - self._c = cursor - # END update our cursor - - # reuse existing cursors if possible - if self._c is not None and self._c.is_associated(): - res = self._c.use_region(offset, size, flags).is_valid() - if res: - # if given size is too large or default, we computer a proper size - # If its smaller, we assume the combination between offset and size - # as chosen by the user is correct and use it ! - # If not, the user is in trouble. - if size > self._c.file_size(): - size = self._c.file_size() - offset - # END handle size - self._size = size - # END set size - return res - # END use our cursor - return False - - def end_access(self): + def close(self): """Call this method once you are done using the instance. It is automatically called on destruction, and should be called just in time to allow system resources to be freed. - Once you called end_access, you must call begin access before reusing this instance!""" - self._size = 0 - if self._c is not None: + Once you called close, you must call begin access before reusing this instance!""" + if self._c: self._c.unuse_region() - # END unuse region + self._c = None + self._size = 0 def cursor(self): """:return: the currently set cursor which provides access to the data""" diff --git a/smmap/mman.py b/smmap/mman.py index 9df69ed..1478836 100644 --- a/smmap/mman.py +++ b/smmap/mman.py @@ -1,5 +1,10 @@ """Module containing a memory memory manager which provides a sliding window on a number of memory mapped files""" +from functools import reduce +import logging +import sys + from .util import ( + PY3, MapWindow, MapRegion, MapRegionList, @@ -7,13 +12,12 @@ string_types, buffer, ) +import gc -import sys -from functools import reduce -__all__ = ["StaticWindowMapManager", "SlidingWindowMapManager", "WindowCursor"] +__all__ = ['managed_mmaps', "StaticWindowMapManager", "SlidingWindowMapManager", "WindowCursor"] #{ Utilities - +log = logging.getLogger(__name__) #}END 
utilities @@ -25,9 +29,15 @@ class WindowCursor(object): Cursors should not be created manually, but are instead returned by the SlidingWindowMapManager - **Note:**: The current implementation is suited for static and sliding window managers, but it also means - that it must be suited for the somewhat quite different sliding manager. It could be improved, but - I see no real need to do so.""" + .. Tip:: + This is a re-entrant, but not thread-safe context-manager, to be used within a ``with ...:`` block, + to ensure any left-overs cursors are cleaned up. If not entered, :meth:`use_region()`` + will scream. + + .. Note:: + The current implementation is suited for static and sliding window managers, + but it also means that it must be suited for the somewhat quite different sliding manager. + It could be improved, but I see no real need to do so.""" __slots__ = ( '_manager', # the manger keeping all file regions '_rlist', # a regions list with regions for our file @@ -43,9 +53,6 @@ def __init__(self, manager=None, regions=None): self._ofs = 0 self._size = 0 - def __del__(self): - self._destroy() - def __enter__(self): return self @@ -113,6 +120,8 @@ def use_region(self, offset=0, size=0, flags=0): **Note:**: The size actually mapped may be smaller than the given size. 
If that is the case, either the file has reached its end, or the map was created between two existing regions""" + self._manager._check_if_entered() + need_region = True man = self._manager fsize = self._rlist.file_size() @@ -136,7 +145,7 @@ def use_region(self, offset=0, size=0, flags=0): self._region.increment_client_count() # END need region handling - self._ofs = offset - self._region._b + self._ofs = offset - self._region._ofs self._size = min(size, self._region.ofs_end() - offset) return self @@ -182,12 +191,12 @@ def ofs_begin(self): """:return: offset to the first byte pointed to by our cursor **Note:** only if is_valid() is True""" - return self._region._b + self._ofs + return self._region._ofs + self._ofs def ofs_end(self): """:return: offset to one past the last available byte""" # unroll method calls for performance ! - return self._region._b + self._ofs + self._size + return self._region._ofs + self._ofs + self._size def size(self): """:return: amount of bytes we point to""" @@ -204,7 +213,7 @@ def includes_ofs(self, ofs): **Note:** cursor must be valid for this to work""" # unroll methods - return (self._region._b + self._ofs) <= ofs < (self._region._b + self._ofs + self._size) + return (self._region._ofs + self._ofs) <= ofs < (self._region._ofs + self._ofs + self._size) def file_size(self): """:return: size of the underlying file""" @@ -235,6 +244,25 @@ def fd(self): #} END interface +def managed_mmaps(check_entered=True): + """Makes a memory-map context-manager instance for the correct python-version. + + :param bool check_entered: + whether to scream if not used as context-manager (`with` block) + :return: + either :class:`SlidingWindowMapManager` or :class:`StaticWindowMapManager` (if PY2) + + If you want to change other default parameters of these classes, use them directly. + + .. Tip:: + Use it in a ``with ...:`` block, to free cached (and unused) resources. 
+ + """ + mman = SlidingWindowMapManager if PY3 else StaticWindowMapManager + + return mman(check_entered=check_entered) + + class StaticWindowMapManager(object): """Provides a manager which will produce single size cursors that are allowed @@ -246,15 +274,24 @@ class StaticWindowMapManager(object): These clients would have to use a SlidingWindowMapBuffer to hide this fact. This type will always use a maximum window size, and optimize certain methods to - accommodate this fact""" + accommodate this fact + + .. Tip:: + The *memory-managers* are re-entrant, but not thread-safe context-manager(s), + to be used within a ``with ...:`` block, ensuring any left-overs cursors are cleaned up. + If not entered, :meth:`make_cursor()` and/or :meth:`WindowCursor.use_region()` will scream. + + """ __slots__ = [ - '_fdict', # mapping of path -> StorageHelper (of some kind - '_window_size', # maximum size of a window - '_max_memory_size', # maximum amount of memory we may allocate - '_max_handle_count', # maximum amount of handles to keep open - '_memory_size', # currently allocated memory size + '_fdict', # mapping of path -> StorageHelper (of some kind + '_window_size', # maximum size of a window + '_max_memory_size', # maximum amount of memory we may allocate + '_max_handle_count', # maximum amount of handles to keep open + '_memory_size', # currently allocated memory size '_handle_count', # amount of currently allocated file handles + '_entered', # updated on enter/exit, when 0, `close()` + 'check_entered', # bool, whether to scream if not used as context-manager (`with` block) ] #{ Configuration @@ -266,7 +303,8 @@ class StaticWindowMapManager(object): _MB_in_bytes = 1024 * 1024 - def __init__(self, window_size=0, max_memory_size=0, max_open_handles=sys.maxsize): + def __init__(self, window_size=0, max_memory_size=0, max_open_handles=sys.maxsize, + check_entered=True): """initialize the manager with the given parameters. 
:param window_size: if -1, a default window size will be chosen depending on the operating system's architecture. It will internally be quantified to a multiple of the page size @@ -276,13 +314,17 @@ def __init__(self, window_size=0, max_memory_size=0, max_open_handles=sys.maxsiz It is a soft limit that is tried to be kept, but nothing bad happens if we have to over-allocate :param max_open_handles: if not maxint, limit the amount of open file handles to the given number. Otherwise the amount is only limited by the system itself. If a system or soft limit is hit, - the manager will free as many handles as possible""" + the manager will free as many handles as possible + :param bool check_entered: whether to scream if not used as context-manager (`with` block) + """ self._fdict = dict() self._window_size = window_size self._max_memory_size = max_memory_size self._max_handle_count = max_open_handles self._memory_size = 0 self._handle_count = 0 + self._entered = 0 + self.check_entered = check_entered if window_size < 0: coeff = 64 @@ -300,6 +342,31 @@ def __init__(self, window_size=0, max_memory_size=0, max_open_handles=sys.maxsiz self._max_memory_size = coeff * self._MB_in_bytes # END handle max memory size + def __enter__(self): + assert self._entered >= 0, self._entered + self._entered += 1 + + return self + + def __exit__(self, exc_type, exc_value, traceback): + assert self._entered > 0, self._entered + self._entered -= 1 + if self._entered == 0: + # Try to close all file-handles + #(a *Windows* only issue, and probably not fixed) + gc.collect() + leaft_overs = self.collect() + if leaft_overs: + log.debug("Cleaned up %s left-over mmap-regions." % leaft_overs) + + def __del__(self): + if self._entered != 0: + log.warning("Missed %s exit(s) on %s!" 
% (self._entered, self)) + self.close() + + def close(self): + self.collect() + #{ Internal Methods def _collect_lru_region(self, size): @@ -322,9 +389,11 @@ def _collect_lru_region(self, size): lru_list = None for regions in self._fdict.values(): for region in regions: - # check client count - if it's 1, it's just us + ## Check client count - if it's 1, it's just us. + # if (region.client_count() == 1 and - (lru_region is None or region._uc < lru_region._uc)): + (lru_region is None or + region.client_count() < lru_region.client_count())): lru_region = region lru_list = regions # END update lru_region @@ -381,6 +450,10 @@ def _obtain_region(self, a, offset, size, flags, is_recursive): assert r.includes_ofs(offset) return r + def _check_if_entered(self): + if self.check_entered and self._entered <= 0: + raise ValueError('Context-manager %s not entered!' % self) + #}END internal methods #{ Interface @@ -400,8 +473,12 @@ def make_cursor(self, path_or_fd): **Note:** Using file descriptors directly is faster once new windows are mapped as it prevents the file to be opened again just for the purpose of mapping it.""" + self._check_if_entered() + regions = self._fdict.get(path_or_fd) - if regions is None: + if regions: + assert not regions.collect_closed_regions(), regions.collect_closed_regions() + else: regions = self.MapRegionListCls(path_or_fd) self._fdict[path_or_fd] = regions # END obtain region for path @@ -484,11 +561,12 @@ class SlidingWindowMapManager(StaticWindowMapManager): a safe amount of memory already, which would possibly cause memory allocations to fail as our address space is full.""" - __slots__ = tuple() + __slots__ = () - def __init__(self, window_size=-1, max_memory_size=0, max_open_handles=sys.maxsize): + def __init__(self, window_size=-1, max_memory_size=0, max_open_handles=sys.maxsize, + check_entered=True): """Adjusts the default window size to -1""" - super(SlidingWindowMapManager, self).__init__(window_size, max_memory_size, max_open_handles) 
+ super(SlidingWindowMapManager, self).__init__(window_size, max_memory_size, max_open_handles, check_entered) def _obtain_region(self, a, offset, size, flags, is_recursive): # bisect to find an existing region. The c++ implementation cannot @@ -498,7 +576,7 @@ def _obtain_region(self, a, offset, size, flags, is_recursive): hi = len(a) while lo < hi: mid = (lo + hi) // 2 - ofs = a[mid]._b + ofs = a[mid]._ofs if ofs <= offset: if a[mid].includes_ofs(offset): r = a[mid] @@ -527,14 +605,14 @@ def _obtain_region(self, a, offset, size, flags, is_recursive): insert_pos = 0 len_regions = len(a) if len_regions == 1: - if a[0]._b <= offset: + if a[0]._ofs <= offset: insert_pos = 1 # END maintain sort else: # find insert position insert_pos = len_regions for i, region in enumerate(a): - if region._b > offset: + if region._ofs > offset: insert_pos = i break # END if insert position is correct diff --git a/smmap/test/test_buf.py b/smmap/test/test_buf.py index 3b6009e..39043f6 100644 --- a/smmap/test/test_buf.py +++ b/smmap/test/test_buf.py @@ -25,104 +25,97 @@ class TestBuf(TestBase): def test_basics(self): + # invalid paths fail upon construction + with FileCreator(self.k_window_test_size, "buffer_test") as fc: + with man_optimal: + with man_optimal.make_cursor(fc.path) as c: + self.assertRaises(ValueError, SlidingWindowMapBuffer, type(c)()) # invalid cursor + self.assertRaises(ValueError, SlidingWindowMapBuffer, c, fc.size) # offset too large + + offset = 100 + with SlidingWindowMapBuffer(c, offset) as buf: + assert buf.cursor() + assert buf.cursor().is_valid() + self.assertEqual(len(buf), fc.size - offset) + + with SlidingWindowMapBuffer(c, fc.size - offset) as buf: + assert buf.cursor() + assert buf.cursor().is_valid() + self.assertEqual(len(buf), offset) + + with SlidingWindowMapBuffer(c) as buf: + assert buf.cursor() + assert buf.cursor().is_valid() + self.assertEqual(len(buf), fc.size) + + # simple access + with open(fc.path, 'rb') as fp: + data = fp.read() + 
self.assertEqual(data[offset], buf[0]) + self.assertEqual(data[offset:offset * 2], buf[0:offset]) + + # negative indices, partial slices + self.assertEqual(buf[-1], buf[len(buf) - 1]) + self.assertEqual(buf[-10:], buf[len(buf) - 10:len(buf)]) + # end access makes its cursor invalid + assert not buf.cursor() + assert not c.is_valid() + assert c.is_associated() # but it remains associated + + self.assertEqual(man_optimal.num_file_handles(), 1) + + def test_performance(self): + # PERFORMANCE + # blast away with random access and a full mapping - we don't want to + # exaggerate the manager's overhead, but measure the buffer overhead + # We do it once with an optimal setting, and with a worse manager which + # will produce small mappings only ! with FileCreator(self.k_window_test_size, "buffer_test") as fc: - - # invalid paths fail upon construction - c = man_optimal.make_cursor(fc.path) - self.assertRaises(ValueError, SlidingWindowMapBuffer, type(c)()) # invalid cursor - self.assertRaises(ValueError, SlidingWindowMapBuffer, c, fc.size) # offset too large - - buf = SlidingWindowMapBuffer() # can create uninitailized buffers - assert buf.cursor() is None - - # can call end access any time - buf.end_access() - buf.end_access() - assert len(buf) == 0 - - # begin access can revive it, if the offset is suitable - offset = 100 - assert buf.begin_access(c, fc.size) == False - assert buf.begin_access(c, offset) == True - assert len(buf) == fc.size - offset - assert buf.cursor().is_valid() - - # empty begin access keeps it valid on the same path, but alters the offset - assert buf.begin_access() == True - assert len(buf) == fc.size - assert buf.cursor().is_valid() - - # simple access with open(fc.path, 'rb') as fp: data = fp.read() - assert data[offset] == buf[0] - assert data[offset:offset * 2] == buf[0:offset] - - # negative indices, partial slices - assert buf[-1] == buf[len(buf) - 1] - assert buf[-10:] == buf[len(buf) - 10:len(buf)] - - # end access makes its cursor invalid 
- buf.end_access() - assert not buf.cursor().is_valid() - assert buf.cursor().is_associated() # but it remains associated - - # an empty begin access fixes it up again - assert buf.begin_access() == True and buf.cursor().is_valid() - del(buf) # ends access automatically - del(c) - - assert man_optimal.num_file_handles() == 1 - - # PERFORMANCE - # blast away with random access and a full mapping - we don't want to - # exaggerate the manager's overhead, but measure the buffer overhead - # We do it once with an optimal setting, and with a worse manager which - # will produce small mappings only ! + max_num_accesses = 100 fd = os.open(fc.path, os.O_RDONLY) - for item in (fc.path, fd): - for manager, man_id in ((man_optimal, 'optimal'), - (man_worst_case, 'worst case'), - (static_man, 'static optimal')): - buf = SlidingWindowMapBuffer(manager.make_cursor(item)) - assert manager.num_file_handles() == 1 - for access_mode in range(2): # single, multi - num_accesses_left = max_num_accesses - num_bytes = 0 - fsize = fc.size - - st = time() - buf.begin_access() - while num_accesses_left: - num_accesses_left -= 1 - if access_mode: # multi - ofs_start = randint(0, fsize) - ofs_end = randint(ofs_start, fsize) - d = buf[ofs_start:ofs_end] - assert len(d) == ofs_end - ofs_start - assert d == data[ofs_start:ofs_end] - num_bytes += len(d) - del d - else: - pos = randint(0, fsize) - assert buf[pos] == data[pos] - num_bytes += 1 - # END handle mode - # END handle num accesses - - buf.end_access() - assert manager.num_file_handles() - assert manager.collect() - assert manager.num_file_handles() == 0 - elapsed = max(time() - st, 0.001) # prevent zero division errors on windows - mb = float(1000 * 1000) - mode_str = (access_mode and "slice") or "single byte" - print("%s: Made %i random %s accesses to buffer created from %s reading a total of %f mb in %f s (%f mb/s)" - % (man_id, max_num_accesses, mode_str, type(item), num_bytes / mb, elapsed, (num_bytes / mb) / elapsed), - 
file=sys.stderr) - # END handle access mode - del buf - # END for each manager - # END for each input - os.close(fd) + try: + for item in (fc.path, fd): + for manager, man_id in ((man_optimal, 'optimal'), + (man_worst_case, 'worst case'), + (static_man, 'static optimal')): + with manager: + for access_mode in range(2): # single, multi + with SlidingWindowMapBuffer(manager.make_cursor(item)) as buf: + self.assertEqual(manager.num_file_handles(), 1) + num_accesses_left = max_num_accesses + num_bytes = 0 + fsize = fc.size + + st = time() + while num_accesses_left: + num_accesses_left -= 1 + if access_mode: # multi + ofs_start = randint(0, fsize) + ofs_end = randint(ofs_start, fsize) + d = buf[ofs_start:ofs_end] + self.assertEqual(len(d), ofs_end - ofs_start) + self.assertEqual(d, data[ofs_start:ofs_end]) + num_bytes += len(d) + del d + else: + pos = randint(0, fsize) + self.assertEqual(buf[pos], data[pos]) + num_bytes += 1 + # END handle mode + # END handle num accesses + assert manager.num_file_handles() + assert manager.collect() + self.assertEqual(manager.num_file_handles(), 0) + elapsed = max(time() - st, 0.001) # prevent zero division errors on windows + mb = float(1000 * 1000) + mode_str = (access_mode and "slice") or "single byte" + print("%s: Made %i random %s accesses to buffer created from %s " + "reading a total of %f mb in %f s (%f mb/s)" + % (man_id, max_num_accesses, mode_str, type(item), + num_bytes / mb, elapsed, (num_bytes / mb) / elapsed), + file=sys.stderr) + finally: + os.close(fd) diff --git a/smmap/test/test_mman.py b/smmap/test/test_mman.py index 96bc355..469ab0e 100644 --- a/smmap/test/test_mman.py +++ b/smmap/test/test_mman.py @@ -20,80 +20,81 @@ class TestMMan(TestBase): def test_cursor(self): with FileCreator(self.k_window_test_size, "cursor_test") as fc: - man = SlidingWindowMapManager() - ci = WindowCursor(man) # invalid cursor - assert not ci.is_valid() + with SlidingWindowMapManager() as man: + ci = WindowCursor(man) # invalid cursor + 
assert not ci.is_valid() + assert not ci.is_associated() + self.assertEqual(ci.size(), 0) # this is cached, so we can query it in invalid state + + cv = man.make_cursor(fc.path) + assert not cv.is_valid() # no region mapped yet + assert cv.is_associated() # but it know where to map it from + self.assertEqual(cv.file_size(), fc.size) + self.assertEqual(cv.path(), fc.path) + + # copy module + cio = copy(cv) + assert not cio.is_valid() and cio.is_associated() + + # assign method assert not ci.is_associated() - assert ci.size() == 0 # this is cached, so we can query it in invalid state + ci.assign(cv) + assert not ci.is_valid() and ci.is_associated() - cv = man.make_cursor(fc.path) - assert not cv.is_valid() # no region mapped yet - assert cv.is_associated() # but it know where to map it from - assert cv.file_size() == fc.size - assert cv.path() == fc.path + # unuse non-existing region is fine + cv.unuse_region() + cv.unuse_region() - # copy module - cio = copy(cv) - assert not cio.is_valid() and cio.is_associated() - - # assign method - assert not ci.is_associated() - ci.assign(cv) - assert not ci.is_valid() and ci.is_associated() - - # unuse non-existing region is fine - cv.unuse_region() - cv.unuse_region() - - # destruction is fine (even multiple times) - cv._destroy() - WindowCursor(man)._destroy() + # destruction is fine (even multiple times) + cv._destroy() + WindowCursor(man)._destroy() def test_memory_manager(self): slide_man = SlidingWindowMapManager() static_man = StaticWindowMapManager() for man in (static_man, slide_man): - assert man.num_file_handles() == 0 - assert man.num_open_files() == 0 - winsize_cmp_val = 0 - if isinstance(man, StaticWindowMapManager): - winsize_cmp_val = -1 - # END handle window size - assert man.window_size() > winsize_cmp_val - assert man.mapped_memory_size() == 0 - assert man.max_mapped_memory_size() > 0 - - # collection doesn't raise in 'any' mode - man._collect_lru_region(0) - # doesn't raise if we are within the limit - 
man._collect_lru_region(10) - - # doesn't fail if we over-allocate - assert man._collect_lru_region(sys.maxsize) == 0 - - # use a region, verify most basic functionality - with FileCreator(self.k_window_test_size, "manager_test") as fc: - fd = os.open(fc.path, os.O_RDONLY) - try: - for item in (fc.path, fd): - c = man.make_cursor(item) - assert c.path_or_fd() is item - assert c.use_region(10, 10).is_valid() - assert c.ofs_begin() == 10 - assert c.size() == 10 - with open(fc.path, 'rb') as fp: - assert c.buffer()[:] == fp.read(20)[10:] - - if isinstance(item, int): - self.assertRaises(ValueError, c.path) - else: - self.assertRaises(ValueError, c.fd) - # END handle value error - # END for each input - finally: - os.close(fd) - # END for each manasger type + with man: + self.assertEqual(man.num_file_handles(), 0) + self.assertEqual(man.num_open_files(), 0) + winsize_cmp_val = 0 + if isinstance(man, StaticWindowMapManager): + winsize_cmp_val = -1 + # END handle window size + assert man.window_size() > winsize_cmp_val + self.assertEqual(man.mapped_memory_size(), 0) + assert man.max_mapped_memory_size() > 0 + + # collection doesn't raise in 'any' mode + man._collect_lru_region(0) + # doesn't raise if we are within the limit + man._collect_lru_region(10) + + # doesn't fail if we over-allocate + self.assertEqual(man._collect_lru_region(sys.maxsize), 0) + + # use a region, verify most basic functionality + with FileCreator(self.k_window_test_size, "manager_test") as fc: + fd = os.open(fc.path, os.O_RDONLY) + try: + for item in (fc.path, fd): + c = man.make_cursor(item) + assert c.path_or_fd() is item + assert c.use_region(10, 10).is_valid() + self.assertEqual(c.ofs_begin(), 10) + self.assertEqual(c.size(), 10) + with open(fc.path, 'rb') as fp: + self.assertEqual(c.buffer()[:], fp.read(20)[10:]) + + if isinstance(item, int): + self.assertRaises(ValueError, c.path) + else: + self.assertRaises(ValueError, c.fd) + # END handle value error + # END for each input + finally: + 
os.close(fd) + # END for each manager type def test_memman_operation(self): # test more access, force it to actually unmap regions @@ -107,120 +108,122 @@ def test_memman_operation(self): for mtype, args in ((StaticWindowMapManager, (0, fc.size // 3, max_num_handles)), (SlidingWindowMapManager, (fc.size // 100, fc.size // 3, max_num_handles)),): for item in (fc.path, fd): - assert len(data) == fc.size + self.assertEqual(len(data), fc.size) # small windows, a reasonable max memory. Not too many regions at once - man = mtype(window_size=args[0], max_memory_size=args[1], max_open_handles=args[2]) - c = man.make_cursor(item) - - # still empty (more about that is tested in test_memory_manager() - assert man.num_open_files() == 0 - assert man.mapped_memory_size() == 0 - - base_offset = 5000 - # window size is 0 for static managers, hence size will be 0. We take that into consideration - size = man.window_size() // 2 - assert c.use_region(base_offset, size).is_valid() - rr = c.region() - assert rr.client_count() == 2 # the manager and the cursor and us - - assert man.num_open_files() == 1 - assert man.num_file_handles() == 1 - assert man.mapped_memory_size() == rr.size() - - # assert c.size() == size # the cursor may overallocate in its static version - assert c.ofs_begin() == base_offset - assert rr.ofs_begin() == 0 # it was aligned and expanded - if man.window_size(): - # but isn't larger than the max window (aligned) - assert rr.size() == align_to_mmap(man.window_size(), True) - else: - assert rr.size() == fc.size - # END ignore static managers which dont use windows and are aligned to file boundaries - - assert c.buffer()[:] == data[base_offset:base_offset + (size or c.size())] - - # obtain second window, which spans the first part of the file - it is a still the same window - nsize = (size or fc.size) - 10 - assert c.use_region(0, nsize).is_valid() - assert c.region() == rr - assert man.num_file_handles() == 1 - assert c.size() == nsize - assert c.ofs_begin() == 0 - 
assert c.buffer()[:] == data[:nsize] - - # map some part at the end, our requested size cannot be kept - overshoot = 4000 - base_offset = fc.size - (size or c.size()) + overshoot - assert c.use_region(base_offset, size).is_valid() - if man.window_size(): - assert man.num_file_handles() == 2 - assert c.size() < size - assert c.region() is not rr # old region is still available, but has not curser ref anymore - assert rr.client_count() == 1 # only held by manager - else: - assert c.size() < fc.size - # END ignore static managers which only have one handle per file - rr = c.region() - assert rr.client_count() == 2 # manager + cursor - assert rr.ofs_begin() < c.ofs_begin() # it should have extended itself to the left - assert rr.ofs_end() <= fc.size # it cannot be larger than the file - assert c.buffer()[:] == data[base_offset:base_offset + (size or c.size())] - - # unising a region makes the cursor invalid - c.unuse_region() - assert not c.is_valid() - if man.window_size(): - # but doesn't change anything regarding the handle count - we cache it and only - # remove mapped regions if we have to - assert man.num_file_handles() == 2 - # END ignore this for static managers - - # iterate through the windows, verify data contents - # this will trigger map collection after a while - max_random_accesses = 5000 - num_random_accesses = max_random_accesses - memory_read = 0 - st = time() - - # cache everything to get some more performance - includes_ofs = c.includes_ofs - max_mapped_memory_size = man.max_mapped_memory_size() - max_file_handles = man.max_file_handles() - mapped_memory_size = man.mapped_memory_size - num_file_handles = man.num_file_handles - while num_random_accesses: - num_random_accesses -= 1 - base_offset = randint(0, fc.size - 1) - - # precondition + with mtype(window_size=args[0], max_memory_size=args[1], max_open_handles=args[2]) as man: + c = man.make_cursor(item) + + # still empty (more about that is tested in test_memory_manager() + 
self.assertEqual(man.num_open_files(), 0) + self.assertEqual(man.mapped_memory_size(), 0) + + base_offset = 5000 + # window size is 0 for static managers, hence size will be 0. We take that into consideration + size = man.window_size() // 2 + assert c.use_region(base_offset, size).is_valid() + rr = c.region() + self.assertEqual(rr.client_count(), 2) # the manager and the cursor and us + + self.assertEqual(man.num_open_files(), 1) + self.assertEqual(man.num_file_handles(), 1) + self.assertEqual(man.mapped_memory_size(), rr.size()) + + # self.assertEqual(c.size(), size # the cursor may overallocate in its static version) + self.assertEqual(c.ofs_begin(), base_offset) + self.assertEqual(rr.ofs_begin(), 0) # it was aligned and expanded + if man.window_size(): + # but isn't larger than the max window (aligned) + self.assertEqual(rr.size(), align_to_mmap(man.window_size(), True)) + else: + self.assertEqual(rr.size(), fc.size) + # END ignore static managers which dont use windows and are aligned to file boundaries + + self.assertEqual(c.buffer()[:], data[base_offset:base_offset + (size or c.size())]) + + # obtain second window, which spans the first part of the file - it is a still the same window + nsize = (size or fc.size) - 10 + assert c.use_region(0, nsize).is_valid() + self.assertEqual(c.region(), rr) + self.assertEqual(man.num_file_handles(), 1) + self.assertEqual(c.size(), nsize) + self.assertEqual(c.ofs_begin(), 0) + self.assertEqual(c.buffer()[:], data[:nsize]) + + # map some part at the end, our requested size cannot be kept + overshoot = 4000 + base_offset = fc.size - (size or c.size()) + overshoot + assert c.use_region(base_offset, size).is_valid() + if man.window_size(): + self.assertEqual(man.num_file_handles(), 2) + assert c.size() < size + assert c.region() is not rr # old region is still available, but has not curser ref anymore + self.assertEqual(rr.client_count(), 1) # only held by manager + else: + assert c.size() < fc.size + # END ignore static 
managers which only have one handle per file + rr = c.region() + self.assertEqual(rr.client_count(), 2) # manager + cursor + assert rr.ofs_begin() < c.ofs_begin() # it should have extended itself to the left + assert rr.ofs_end() <= fc.size # it cannot be larger than the file + self.assertEqual(c.buffer()[:], data[base_offset:base_offset + (size or c.size())]) + + # unising a region makes the cursor invalid + c.unuse_region() + assert not c.is_valid() if man.window_size(): - assert max_mapped_memory_size >= mapped_memory_size() - # END statics will overshoot, which is fine - assert max_file_handles >= num_file_handles() - assert c.use_region(base_offset, (size or c.size())).is_valid() - csize = c.size() - assert c.buffer()[:] == data[base_offset:base_offset + csize] - memory_read += csize - - assert includes_ofs(base_offset) - assert includes_ofs(base_offset + csize - 1) - assert not includes_ofs(base_offset + csize) - # END while we should do an access - elapsed = max(time() - st, 0.001) # prevent zero divison errors on windows - mb = float(1000 * 1000) - print("%s: Read %i mb of memory with %i random on cursor initialized with %s accesses in %fs (%f mb/s)\n" - % (mtype, memory_read / mb, max_random_accesses, type(item), elapsed, (memory_read / mb) / elapsed), - file=sys.stderr) - - # an offset as large as the size doesn't work ! 
- assert not c.use_region(fc.size, size).is_valid() - - # collection - it should be able to collect all - assert man.num_file_handles() - assert man.collect() - assert man.num_file_handles() == 0 - # END for each item - # END for each manager type + # but doesn't change anything regarding the handle count - we cache it and only + # remove mapped regions if we have to + self.assertEqual(man.num_file_handles(), 2) + # END ignore this for static managers + + # iterate through the windows, verify data contents + # this will trigger map collection after a while + max_random_accesses = 5000 + num_random_accesses = max_random_accesses + memory_read = 0 + st = time() + + # cache everything to get some more performance + includes_ofs = c.includes_ofs + max_mapped_memory_size = man.max_mapped_memory_size() + max_file_handles = man.max_file_handles() + mapped_memory_size = man.mapped_memory_size + num_file_handles = man.num_file_handles + while num_random_accesses: + num_random_accesses -= 1 + base_offset = randint(0, fc.size - 1) + + # precondition + if man.window_size(): + assert max_mapped_memory_size >= mapped_memory_size() + # END statics will overshoot, which is fine + assert max_file_handles >= num_file_handles() + assert c.use_region(base_offset, (size or c.size())).is_valid() + csize = c.size() + self.assertEqual(c.buffer()[:], data[base_offset:base_offset + csize]) + memory_read += csize + + assert includes_ofs(base_offset) + assert includes_ofs(base_offset + csize - 1) + assert not includes_ofs(base_offset + csize) + # END while we should do an access + elapsed = max(time() - st, 0.001) # prevent zero divison errors on windows + mb = float(1000 * 1000) + print("%s: Read %i mb of memory with %i random on cursor " + "initialized with %s accesses in %fs (%f mb/s)\n" + % (mtype, memory_read / mb, max_random_accesses, + type(item), elapsed, (memory_read / mb) / elapsed), + file=sys.stderr) + + # an offset as large as the size doesn't work ! 
+ assert not c.use_region(fc.size, size).is_valid() + + # collection - it should be able to collect all + assert man.num_file_handles() + assert man.collect() + self.assertEqual(man.num_file_handles(), 0) + # END for each item + # END for each manager type finally: os.close(fd) diff --git a/smmap/test/test_tutorial.py b/smmap/test/test_tutorial.py index b03db9b..0adec5d 100644 --- a/smmap/test/test_tutorial.py +++ b/smmap/test/test_tutorial.py @@ -22,60 +22,57 @@ def test_example(self): import smmap.test.lib with smmap.test.lib.FileCreator(1024 * 1024 * 8, "test_file") as fc: # obtain a cursor to access some file. - c = mman.make_cursor(fc.path) - - # the cursor is now associated with the file, but not yet usable - assert c.is_associated() - assert not c.is_valid() - - # before you can use the cursor, you have to specify a window you want to - # access. The following just says you want as much data as possible starting - # from offset 0. - # To be sure your region could be mapped, query for validity - assert c.use_region().is_valid() # use_region returns self - - # once a region was mapped, you must query its dimension regularly - # to assure you don't try to access its buffer out of its bounds - assert c.size() - c.buffer()[0] # first byte - c.buffer()[1:10] # first 9 bytes - c.buffer()[c.size() - 1] # last byte - - # its recommended not to create big slices when feeding the buffer - # into consumers (e.g. struct or zlib). - # Instead, either give the buffer directly, or use pythons buffer command. - from smmap.util import buffer - buffer(c.buffer(), 1, 9) # first 9 bytes without copying them - - # you can query absolute offsets, and check whether an offset is included - # in the cursor's data. - assert c.ofs_begin() < c.ofs_end() - assert c.includes_ofs(100) - - # If you are over out of bounds with one of your region requests, the - # cursor will be come invalid. 
It cannot be used in that state - assert not c.use_region(fc.size, 100).is_valid() - # map as much as possible after skipping the first 100 bytes - assert c.use_region(100).is_valid() - - # You can explicitly free cursor resources by unusing the cursor's region - c.unuse_region() - assert not c.is_valid() - - # Buffers - ######### - # Create a default buffer which can operate on the whole file - buf = smmap.SlidingWindowMapBuffer(mman.make_cursor(fc.path)) - - # you can use it right away - assert buf.cursor().is_valid() - - buf[0] # access the first byte - buf[-1] # access the last ten bytes on the file - buf[-10:] # access the last ten bytes - - # If you want to keep the instance between different accesses, use the - # dedicated methods - buf.end_access() - assert not buf.cursor().is_valid() # you cannot use the buffer anymore - assert buf.begin_access(offset=10) # start using the buffer at an offset + with mman: + c = mman.make_cursor(fc.path) + + # the cursor is now associated with the file, but not yet usable + assert c.is_associated() + assert not c.is_valid() + + # before you can use the cursor, you have to specify a window you want to + # access. The following just says you want as much data as possible starting + # from offset 0. + # To be sure your region could be mapped, query for validity + assert c.use_region().is_valid() # use_region returns self + + # once a region was mapped, you must query its dimension regularly + # to assure you don't try to access its buffer out of its bounds + assert c.size() + c.buffer()[0] # first byte + c.buffer()[1:10] # first 9 bytes + c.buffer()[c.size() - 1] # last byte + + # its recommended not to create big slices when feeding the buffer + # into consumers (e.g. struct or zlib). + # Instead, either give the buffer directly, or use pythons buffer command. 
+ from smmap.util import buffer + buffer(c.buffer(), 1, 9) # first 9 bytes without copying them + + # you can query absolute offsets, and check whether an offset is included + # in the cursor's data. + assert c.ofs_begin() < c.ofs_end() + assert c.includes_ofs(100) + + # If you are over out of bounds with one of your region requests, the + # cursor will be come invalid. It cannot be used in that state + assert not c.use_region(fc.size, 100).is_valid() + # map as much as possible after skipping the first 100 bytes + assert c.use_region(100).is_valid() + + # You can explicitly free cursor resources by unusing the cursor's region + c.unuse_region() + assert not c.is_valid() + + # Buffers + ######### + # Create a default buffer which can operate on the whole file + with smmap.SlidingWindowMapBuffer(mman.make_cursor(fc.path)) as buf: + # you can use it right away + assert buf.cursor().is_valid() + + buf[0] # access the first byte + buf[-1] # access the last ten bytes on the file + buf[-10:] # access the last ten bytes + + assert not buf.cursor() + assert not c.is_valid() # you cannot use the buffer anymore diff --git a/smmap/util.py b/smmap/util.py index 02df41a..0628fc6 100644 --- a/smmap/util.py +++ b/smmap/util.py @@ -18,21 +18,23 @@ try: # Python 2 - buffer = buffer + buffer = buffer # @UndefinedVariable except NameError: # Python 3 has no `buffer`; only `memoryview` def buffer(obj, offset, size): - # Actually, for gitpython this is fastest ... . - return memoryview(obj)[offset:offset+size] - # doing it directly is much faster ! - # return obj[offset:offset + size] + # Actually, for gitpython this is fastest ... but `memoryviews` LEAK! 
+ #return memoryview(obj)[offset:offset + size] + return obj[offset:offset + size] + + +PY3 = sys.version_info[0] >= 3 def string_types(): - if sys.version_info[0] >= 3: + if PY3: return str else: - return basestring + return basestring # @UndefinedVariable def align_to_mmap(num, round_up): @@ -63,7 +65,7 @@ class MapWindow(object): """Utility type which is used to snap windows towards each other, and to adjust their size""" __slots__ = ( 'ofs', # offset into the file in bytes - 'size' # size of the window in bytes + 'size' # size of the window in bytes ) def __init__(self, offset, size): @@ -76,7 +78,7 @@ def __repr__(self): @classmethod def from_region(cls, region): """:return: new window from a region""" - return cls(region._b, region.size()) + return cls(region._ofs, region.size()) def ofs_end(self): return self.ofs + self.size @@ -110,10 +112,10 @@ class MapRegion(object): **Note:** deallocates used region automatically on destruction""" __slots__ = [ - '_b', # beginning of mapping - '_mf', # mapped memory chunk (as returned by mmap) - '_uc', # total amount of usages + '_ofs', # beginning of mapping '_size', # cached size of our memory map + '_mf', # mapped memory chunk (as returned by mmap) + '_uc', # total amount of usages '__weakref__' ] _need_compat_layer = sys.version_info[:2] < (2, 6) @@ -133,7 +135,7 @@ def __init__(self, path_or_fd, ofs, size, flags=0): allocated the the size automatically adjusted :param flags: additional flags to be given when opening the file. 
:raise Exception: if no memory can be allocated""" - self._b = ofs + self._ofs = ofs self._size = 0 self._uc = 0 @@ -141,8 +143,6 @@ def __init__(self, path_or_fd, ofs, size, flags=0): fd = path_or_fd else: fd = os.open(path_or_fd, os.O_RDONLY | getattr(os, 'O_BINARY', 0) | flags) - # END handle fd - try: kwargs = dict(access=ACCESS_READ, offset=ofs) corrected_size = size @@ -166,7 +166,7 @@ def __init__(self, path_or_fd, ofs, size, flags=0): self._mfb = buffer(self._mf, ofs, self._size) # END handle buffer wrapping finally: - if isinstance(path_or_fd, string_types()): + if not isinstance(path_or_fd, int): os.close(fd) # END only close it if we opened it # END close file handle @@ -174,7 +174,7 @@ def __init__(self, path_or_fd, ofs, size, flags=0): self.increment_client_count() def __repr__(self): - return "MapRegion<%i, %i>" % (self._b, self.size()) + return "MapRegion<%i, %i>" % (self._ofs, self.size()) #{ Interface @@ -188,7 +188,7 @@ def map(self): def ofs_begin(self): """:return: absolute byte offset to the first byte of the mapping""" - return self._b + return self._ofs def size(self): """:return: total size of the mapped region in bytes""" @@ -196,17 +196,17 @@ def size(self): def ofs_end(self): """:return: Absolute offset to one byte beyond the mapping into the file""" - return self._b + self._size + return self._ofs + self._size def includes_ofs(self, ofs): """:return: True if the given offset can be read in our mapped region""" - return self._b <= ofs < self._b + self._size + return self._ofs <= ofs < self._ofs + self._size def client_count(self): """:return: number of clients currently using this region""" return self._uc - def increment_client_count(self, ofs = 1): + def increment_client_count(self, ofs=1): """Adjust the usage count by the given positive or negative offset. If usage count equals 0, we will auto-release our resources :return: True if we released resources, False otherwise. 
In the latter case, we can still be used""" @@ -227,7 +227,7 @@ def release(self): # re-define all methods which need offset adjustments in compatibility mode if _need_compat_layer: def size(self): - return self._size - self._b + return self._size - self._ofs def ofs_end(self): # always the size - we are as large as it gets @@ -237,7 +237,7 @@ def buffer(self): return self._mfb def includes_ofs(self, ofs): - return self._b <= ofs < self._size + return self._ofs <= ofs < self._size # END handle compat layer #} END interface @@ -273,4 +273,9 @@ def file_size(self): # END update file size return self._file_size + def collect_closed_regions(self): + """a PY3+ utility for assertions""" + # The `closed` attribute is PY3.2+ + return [region for region in self if getattr(region._mf, 'closed', None)] + #} END utility classes