diff options
Diffstat (limited to 'datastore/src/carquinyol/optimizer.py')
-rw-r--r-- | datastore/src/carquinyol/optimizer.py | 167 |
1 files changed, 167 insertions, 0 deletions
# Copyright (C) 2008, One Laptop Per Child
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

import os
import errno
import hashlib
import logging
import subprocess

import gobject

from carquinyol import layoutmanager


class Optimizer(object):
    """Optimizes disk space usage by detecting duplicates and sharing storage.

    Bookkeeping is kept on disk as empty marker files:

    * the queue directory holds one file per uid waiting to be processed;
    * the checksums directory holds one sub-directory per checksum, which
      in turn holds one file per uid whose content has that checksum.
    """

    # Number of bytes read at a time while hashing, so that large entry
    # files are never loaded into memory in one piece.
    _CHUNK_SIZE = 64 * 1024

    def __init__(self, file_store, metadata_store):
        self._file_store = file_store
        self._metadata_store = metadata_store
        # Id returned by gobject.idle_add() while queue processing is
        # scheduled, None otherwise.
        self._enqueue_checksum_id = None

    def optimize(self, uid):
        """Add an entry to a queue of entries to be checked for duplicates.

        Nothing is queued when the file store has no file for `uid`.
        Queue processing is scheduled as a low priority idle callback
        unless one is already pending.
        """
        if not os.path.exists(self._file_store.get_file_path(uid)):
            return

        queue_path = layoutmanager.get_instance().get_queue_path()
        queue_entry_path = os.path.join(queue_path, uid)
        open(queue_entry_path, 'w').close()
        logging.debug('optimize %r', queue_entry_path)

        if self._enqueue_checksum_id is None:
            self._enqueue_checksum_id = \
                    gobject.idle_add(self._process_entry_cb,
                                     priority=gobject.PRIORITY_LOW)

    def remove(self, uid):
        """Remove any structures left from space optimization.

        Deletes the marker of `uid` from the directory of its checksum
        and then the directory itself if no other entry shares that
        checksum. Does nothing when the entry has no 'checksum' property.

        Raises OSError if the checksum directory cannot be removed for a
        reason other than still containing entries.
        """
        checksum = self._metadata_store.get_property(uid, 'checksum')
        if checksum is None:
            return

        checksum_path = self._checksum_path(checksum)
        checksum_entry_path = os.path.join(checksum_path, uid)

        if os.path.exists(checksum_entry_path):
            logging.debug('remove %r', checksum_entry_path)
            os.remove(checksum_entry_path)

        if os.path.exists(checksum_path):
            try:
                os.rmdir(checksum_path)
                logging.debug('removed %r', checksum_path)
            except OSError as e:
                # Other entries still share this checksum: keep the dir.
                if e.errno != errno.ENOTEMPTY:
                    raise

    def _checksum_path(self, checksum):
        """Return the directory that tracks entries with this checksum."""
        checksums_dir = layoutmanager.get_instance().get_checksums_dir()
        return os.path.join(checksums_dir, checksum)

    def _identical_file_already_exists(self, checksum):
        """Check if we already have a directory for this checksum."""
        return os.path.exists(self._checksum_path(checksum))

    def _get_uid_from_checksum(self, checksum):
        """Get an existing entry whose file matches checksum.

        Returns the first uid listed in the checksum directory, or None
        if the directory is empty (instead of failing with IndexError).
        """
        uids = os.listdir(self._checksum_path(checksum))
        if not uids:
            return None
        return uids[0]

    def _create_checksum_dir(self, checksum):
        """Create directory that tracks files with this same checksum."""
        checksum_path = self._checksum_path(checksum)
        logging.debug('create dir %r', checksum_path)
        os.mkdir(checksum_path)

    def _add_checksum_entry(self, uid, checksum):
        """Create a file in the checksum dir with the uid of the entry."""
        checksum_entry_path = os.path.join(self._checksum_path(checksum), uid)
        logging.debug('touch %r', checksum_entry_path)
        open(checksum_entry_path, 'w').close()

    def _already_linked(self, uid, checksum):
        """Check if this entry already has a marker in the checksum dir."""
        return os.path.exists(os.path.join(self._checksum_path(checksum), uid))

    def _process_entry_cb(self):
        """Idle callback: process one item of the checksums queue.

        Returns True while more items are left in the queue, so that the
        callback is invoked again, and False once the queue is drained.
        """
        queue_path = layoutmanager.get_instance().get_queue_path()
        queue = os.listdir(queue_path)
        if queue:
            uid = queue[0]
            logging.debug('_process_entry_cb processing %r', uid)

            try:
                self._process_entry(uid)
            except Exception:
                # Optimization is best effort. Letting the error escape
                # would leave both the queue marker and
                # _enqueue_checksum_id in place, so optimize() would
                # never schedule queue processing again.
                logging.exception('Error while optimizing entry %r', uid)

            os.remove(os.path.join(queue_path, uid))

        if len(queue) <= 1:
            self._enqueue_checksum_id = None
            return False
        return True

    def _process_entry(self, uid):
        """Checksum one entry and share its storage with an identical one.

        Stores the checksum in the 'checksum' metadata property. If
        another entry with the same checksum is already tracked, the file
        of `uid` is replaced by a hard link to that entry's file.
        """
        file_in_entry_path = self._file_store.get_file_path(uid)
        if not os.path.exists(file_in_entry_path):
            logging.info('non-existent entry in queue: %r', uid)
            return

        checksum = self._calculate_md5sum(file_in_entry_path)
        self._metadata_store.set_property(uid, 'checksum', checksum)

        if not self._identical_file_already_exists(checksum):
            self._create_checksum_dir(checksum)
            self._add_checksum_entry(uid, checksum)
            return

        if self._already_linked(uid, checksum):
            return

        existing_entry_uid = self._get_uid_from_checksum(checksum)
        if existing_entry_uid is not None:
            self._file_store.hard_link_entry(uid, existing_entry_uid)
        # With an empty checksum dir there is nothing to link to: this
        # entry simply becomes the first one tracked for the checksum.
        self._add_checksum_entry(uid, checksum)

    def _calculate_md5sum(self, path):
        """Calculate the md5 checksum of a given file.

        Returns the hexadecimal digest. The file is hashed in-process
        rather than by parsing the output of an external `md5sum`, which
        yielded an empty checksum when the command failed.

        Raises IOError/OSError if the file cannot be read.
        """
        digest = hashlib.md5()
        with open(path, 'rb') as entry_file:
            for chunk in iter(lambda: entry_file.read(self._CHUNK_SIZE), b''):
                digest.update(chunk)
        return digest.hexdigest()