Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/src/carquinyol/optimizer.py
blob: c038c2bc3df867584f9d33a722fa0d8710b21a8e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# Copyright (C) 2008, One Laptop Per Child
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

import errno
import hashlib
import logging
import os
import subprocess

import gobject

from carquinyol import layoutmanager


class Optimizer(object):
    """Optimizes disk space usage by detecting duplicates and sharing storage.

    Entries waiting to be checked are recorded as empty files, named after
    their uid, inside the layout manager's queue directory.  They are
    processed one at a time from a low priority idle callback.

    For every checksum seen, a directory named after the checksum is kept
    inside the layout manager's checksums directory.  It contains one empty
    file per uid whose file has that checksum.
    """

    # Number of bytes read at a time while checksumming a file, so that big
    # files are never loaded whole into memory.
    _CHECKSUM_CHUNK_SIZE = 64 * 1024

    def __init__(self, file_store, metadata_store):
        """Create an optimizer working on the given stores.

        file_store -- object providing get_file_path(uid) and
                      hard_link_entry(uid, existing_uid).
        metadata_store -- object providing get_property(uid, key) and
                          set_property(uid, key, value).
        """
        self._file_store = file_store
        self._metadata_store = metadata_store
        # Id of the pending idle callback, None when nothing is scheduled.
        self._enqueue_checksum_id = None

    def optimize(self, uid):
        """Add an entry to a queue of entries to be checked for duplicates.

        Does nothing if the entry has no file in the file store.
        """
        if not os.path.exists(self._file_store.get_file_path(uid)):
            return

        queue_path = layoutmanager.get_instance().get_queue_path()
        queue_entry_path = os.path.join(queue_path, uid)
        # An empty file named after the uid is the queue item.
        open(queue_entry_path, 'w').close()
        logging.debug('optimize %r', queue_entry_path)

        # Only schedule the callback once; it keeps itself alive while there
        # are items left in the queue.
        if self._enqueue_checksum_id is None:
            self._enqueue_checksum_id = \
                    gobject.idle_add(self._process_entry_cb,
                                     priority=gobject.PRIORITY_LOW)

    def remove(self, uid):
        """Remove any structures left from space optimization

        Deletes the uid marker from its checksum directory and then the
        directory itself if no other entry is using it.
        """
        checksum = self._metadata_store.get_property(uid, 'checksum')
        # An empty checksum would make checksum_path point at the checksums
        # directory itself, which must never be removed from here.
        if not checksum:
            return

        checksum_path = self._get_checksum_path(checksum)
        checksum_entry_path = os.path.join(checksum_path, uid)

        if os.path.exists(checksum_entry_path):
            logging.debug('remove %r', checksum_entry_path)
            os.remove(checksum_entry_path)

        if os.path.exists(checksum_path):
            try:
                os.rmdir(checksum_path)
                logging.debug('removed %r', checksum_path)
            except OSError as e:
                # Other entries still share this checksum: keep the dir.
                if e.errno != errno.ENOTEMPTY:
                    raise

    def _get_checksum_path(self, checksum):
        """Return the path of the directory tracking this checksum.

        """
        checksums_dir = layoutmanager.get_instance().get_checksums_dir()
        return os.path.join(checksums_dir, checksum)

    def _identical_file_already_exists(self, checksum):
        """Check if we already have files with this checksum.

        """
        return os.path.exists(self._get_checksum_path(checksum))

    def _get_uid_from_checksum(self, checksum):
        """Get an existing entry whose file matches checksum.

        Returns None if the checksum directory holds no entries.
        """
        uids = os.listdir(self._get_checksum_path(checksum))
        if not uids:
            return None
        return uids[0]

    def _create_checksum_dir(self, checksum):
        """Create directory that tracks files with this same checksum.

        """
        checksum_path = self._get_checksum_path(checksum)
        logging.debug('create dir %r', checksum_path)
        os.mkdir(checksum_path)

    def _add_checksum_entry(self, uid, checksum):
        """Create a file in the checksum dir with the uid of the entry

        """
        entry_path = os.path.join(self._get_checksum_path(checksum), uid)
        logging.debug('touch %r', entry_path)
        open(entry_path, 'w').close()

    def _already_linked(self, uid, checksum):
        """Check if this entry's file is already a hard link to the checksums
           dir.

        """
        entry_path = os.path.join(self._get_checksum_path(checksum), uid)
        return os.path.exists(entry_path)

    def _process_entry(self, uid):
        """Checksum the file of one entry and share storage with a duplicate.

        """
        file_in_entry_path = self._file_store.get_file_path(uid)
        if not os.path.exists(file_in_entry_path):
            logging.info('non-existent entry in queue: %r', uid)
            return

        checksum = self._calculate_md5sum(file_in_entry_path)
        self._metadata_store.set_property(uid, 'checksum', checksum)

        if not self._identical_file_already_exists(checksum):
            # First file seen with this checksum: just start tracking it.
            self._create_checksum_dir(checksum)
            self._add_checksum_entry(uid, checksum)
            return

        if self._already_linked(uid, checksum):
            return

        existing_entry_uid = self._get_uid_from_checksum(checksum)
        # The checksum dir can be empty (e.g. left behind by an interrupted
        # run); there is then nothing to link to, only the entry to record.
        if existing_entry_uid is not None:
            self._file_store.hard_link_entry(uid, existing_entry_uid)
        self._add_checksum_entry(uid, checksum)

    def _process_entry_cb(self):
        """Process one item in the checksums queue by calculating its checksum,
           checking if there exist already an identical file, and in that case
           substituting its file with a hard link to that pre-existing file.

        Returns True while there are more items to process, so that the idle
        callback gets called again, and False once the queue is drained.
        """
        queue_path = layoutmanager.get_instance().get_queue_path()
        queue = os.listdir(queue_path)
        if queue:
            uid = queue[0]
            logging.debug('_process_entry_cb processing %r', uid)

            try:
                self._process_entry(uid)
            except Exception:
                # Optimizing is best effort.  Letting the error escape would
                # leave the item in the queue and _enqueue_checksum_id set,
                # so no further entry would ever be processed.
                logging.exception('Error optimizing entry %r', uid)

            os.remove(os.path.join(queue_path, uid))

        if len(queue) <= 1:
            self._enqueue_checksum_id = None
            return False
        else:
            return True

    def _calculate_md5sum(self, path):
        """Calculate the md5 checksum of a given file.

        Returns the checksum as a string of 32 lowercase hexadecimal digits.
        Raises IOError (OSError in Python 3) if the file cannot be read.
        """
        digest = hashlib.md5()
        with open(path, 'rb') as f:
            # Read in bounded chunks; read() returns b'' at end of file.
            for chunk in iter(lambda: f.read(self._CHECKSUM_CHUNK_SIZE), b''):
                digest.update(chunk)
        return digest.hexdigest()