00001
00002
00003
00004 import os, os.path, sys, string, shutil, tempfile, fileUtils, time
00005
00006
# Usage text of the script. It is bound to the module docstring and printed
# verbatim by the command-line entry point, both for -h/--help and after an
# argument error, so its content is runtime output rather than a mere comment.
__doc__ = """
Usage: tree-file-compare.py [ -h | --help ] [ -v | --verbose ] [ --by-name-only ] --reference APath [--mirror AnotherPath]
Will scan first specified tree, in search of duplicated files (same content, different path). The resulting associations will be stored in ~/*-tree-file-compare.log files. If a second tree is specified (--mirror option), then will look for files whose content is in second tree but not in the first one, to ensure the reference tree is complete.

This script is useful to ensure a reference tree does not lack any content from a mirror and to know whether the mirror is up-to-date.
The script can be used for example for snapshots or archives.

Options:
-v or --verbose: set verbose mode
--by-name-only: comparison is done based on names only; no MD5 checksum performed (useful when the names refer clearly to the content, as an archive filename, as opposed to snapshots)
--mirror A_PATH: specifies a second tree to compare with
"""
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
# Handle on the report file: stays None until the command-line entry point
# opens log_filename for writing.
log_file = None

# Reports are written directly into the home directory of the current user.
base_write_path = os.path.expanduser("~")

# One report name per UTC day, e.g. 20240131-tree-file-compare.log.
file_base_name = time.strftime("%Y%m%d", time.gmtime()) + "-tree-file-compare.log"

# Full path of the report file.
log_filename = os.path.join(base_write_path, file_base_name)
00049
00050
00051
def output(message):
    """Echoes message on standard output and appends it to the report file.

    The text is written to the module-level log_file followed by a newline;
    log_file is None until the entry point opens it, so this function can
    only be used once the report file has been opened.
    """
    print(message)
    log_line = "%s\n" % message
    log_file.write(log_line)
00055
00056
00057
def build_file_index_for(path):
    """Builds both the content index and the name index of the tree at path.

    The files are the ones listed by fileUtils.getAllRelativeFilePathsFromRoot
    (paths relative to the root, judging by the name of that helper).

    Returns a (content_dict, name_dict) pair:
      - content_dict maps the value returned by fileUtils.getMD5codeFor for a
        file to the list of listed paths having that value;
      - name_dict maps a base name to the list of listed paths bearing it.
    """
    by_content = {}
    by_name = {}

    for listed_path in fileUtils.getAllRelativeFilePathsFromRoot(path):

        # The checksum helper is given the listed path joined onto the root.
        digest = fileUtils.getMD5codeFor(os.path.join(path, listed_path))
        by_content.setdefault(digest, []).append(listed_path)

        # Same grouping, this time keyed by the last path component only.
        by_name.setdefault(os.path.basename(listed_path), []).append(listed_path)

    return (by_content, by_name)
00093
00094
00095
def build_name_index_for(path):
    """Creates one (dictionary-based) name index for specified path.

    The files are the ones listed by fileUtils.getAllRelativeFilePathsFromRoot
    (paths relative to the root, judging by the name of that helper). No file
    content is read here: only the listed paths are used.

    Returns a dictionary mapping each base name to the list of listed paths
    bearing that name.
    """
    name_dict = {}

    for f in fileUtils.getAllRelativeFilePathsFromRoot(path):
        # setdefault creates the list the first time a name is met, which
        # replaces the former has_key test; the full path is not needed for
        # a name-only index, so it is no longer computed.
        name_dict.setdefault(os.path.basename(f), []).append(f)

    return name_dict
00120
00121
00122
def display_content_duplicates(root_path, content_index):
    """Reports each group of files of the content index sharing one content.

    root_path is only used in the header line. content_index maps a content
    key to a list of file paths; only lists holding more than one path are
    reported. A blank line closes the section.
    """
    output("Displaying duplicated content in tree %s:" % (root_path,))
    for same_content_paths in content_index.values():
        if len(same_content_paths) < 2:
            # A single path means this content is not duplicated.
            continue
        output(" + identical content: %s." % (same_content_paths,))
    output("")
00132
00133
00134
def display_name_duplicates(root_path, name_index):
    """Reports each group of files of the name index sharing one base name.

    root_path is only used in the header line. name_index maps a name to a
    list of file paths; only lists holding more than one path are reported.
    A blank line closes the section.
    """
    output("Displaying duplicated names in tree %s:" % (root_path,))
    for same_name_paths in name_index.values():
        if len(same_name_paths) < 2:
            # A single path means this name is not duplicated.
            continue
        output(" + duplicated names: %s." % (same_name_paths,))
    output("")
00144
00145
00146
def compare_content_trees(ref_content_index, mirror_content_index):
    """Walks the reference content index and reports how the mirror matches it.

    For each content key of the reference:
      - if the mirror lacks it, a parenthesised note lists the reference files;
      - if the mirror has it under a different list of paths, both lists are
        reported;
      - if both lists are equal, nothing is reported.
    A blank line closes the section.
    """
    output("Comparing reference tree with mirror tree:")
    for content_key, ref_files in ref_content_index.items():
        if content_key not in mirror_content_index:
            output(" (content corresponding to %s is in reference but not in mirror)" % (ref_files,))
            continue
        mirror_files = mirror_content_index[content_key]
        if mirror_files != ref_files:
            output(" + identical content for %s in reference and %s in mirror." % (ref_files, mirror_files))
    output("")
00161
00162
def compare_name_trees(ref_name_index, mirror_name_index):
    """Walks the reference name index and reports how the mirror matches it.

    For each name of the reference:
      - if the mirror lacks it, a parenthesised note lists the reference files;
      - if the mirror has it under a different list of paths, both lists are
        reported;
      - if both lists are equal, nothing is reported.
    A blank line closes the section.
    """
    output("Comparing reference tree with mirror tree:")
    for name, ref_files in ref_name_index.items():
        if name not in mirror_name_index:
            output(" (name corresponding to %s is in reference but not in mirror)" % (ref_files,))
            continue
        mirror_files = mirror_name_index[name]
        if mirror_files != ref_files:
            output(" + identical name for %s in reference and %s in mirror." % (ref_files, mirror_files))
    output("")
00177
00178
00179
def check_content_completeness(ref_content_index, mirror_content_index):
    """Reports every content of the mirror index that the reference index lacks.

    Only the keys of ref_content_index are consulted; for each missing key the
    mirror files carrying that content are listed. A blank line closes the
    section.
    """
    output("Checking completeness of reference regarding the mirror:")
    for content_key, mirror_files in mirror_content_index.items():
        if content_key in ref_content_index:
            # The reference already holds this content, whatever its path.
            continue
        output(" + content corresponding to %s is in mirror but not in reference." % (mirror_files,))
    output("")
00188
00189
00190
def check_name_completeness(ref_name_index, mirror_name_index):
    """Reports every file name of the mirror index that the reference index lacks.

    Only the keys of ref_name_index are consulted; for each missing name the
    mirror files bearing it are listed. A blank line closes the section.
    """
    output("Checking completeness of reference regarding the mirror:")
    for name, mirror_files in mirror_name_index.items():
        if name in ref_name_index:
            # The reference already holds a file with this name.
            continue
        output(" + name corresponding to %s is in mirror but not in reference." % (mirror_files,))
    output("")
00199
00200
00201
def write_hashes(log_file, content_index):
    """Writes specified content index in specified log file.

    log_file: object exposing a write() method accepting text (the report
    file, in this script).
    content_index: dictionary whose keys are content hashes and whose values
    are printed as-is next to their key.

    The section starts with a "Hashes:" header and ends with a blank line.
    Entries are written sorted by key, so that the section does not depend on
    the internal ordering of the dictionary and two reports can be compared
    line by line.
    """
    log_file.write("Hashes:\n\n")
    for k in sorted(content_index):
        log_file.write(" %s %s\n" % (k, content_index[k]))
    log_file.write("\n")
00208
00209
00210
00211
if __name__ == '__main__':

    # Command-line entry point: parses the options, scans the reference tree
    # (and the mirror tree, if any), and writes the findings both on standard
    # output and in the report file designated by log_filename.
    #
    # Exit codes: 0 after displaying the help, 1 on a malformed command line,
    # 2 when no reference path was given.

    help_options = ['-h', '--help']
    verbose_options = ['-v', '--verbose']
    by_name_options = ['--by-name-only']

    # Options that must be followed by a path.
    path_options = ['--reference', '--mirror']

    verbose = False
    compare_by_content = True

    reference_path = None
    mirror_path = None

    # Kept untouched, so that the exact command line can be recorded in the
    # report.
    saved_args = sys.argv[1:]

    # The arguments are consumed from a private copy, one item at a time.
    pending_args = list(saved_args)

    while pending_args:

        item = pending_args.pop(0)

        if item in help_options:
            print(__doc__)
            sys.exit(0)

        elif item in path_options:
            # Without this check, a trailing --reference or --mirror would
            # crash on the next pop() instead of reporting a usage error.
            if not pending_args:
                print("Error, option %s expects a path, stopping." % (item,))
                print(__doc__)
                sys.exit(1)
            if item == "--reference":
                reference_path = pending_args.pop(0)
            else:
                mirror_path = pending_args.pop(0)

        elif item in verbose_options:
            verbose = True
            print("Verbose mode activated.")

        elif item in by_name_options:
            compare_by_content = False
            print("Comparison will be based on names only, rather than on content too.")

        else:
            print("Error, unexpected parameter: %s, stopping." % (item,))
            print(__doc__)
            sys.exit(1)

    if verbose:
        print("Reference path = %s" % (reference_path,))
        print("Mirror path = %s" % (mirror_path,))

    if not reference_path:
        print("Error, no reference path given, stopping.")
        print(__doc__)
        sys.exit(2)

    # Assigning the module-level handle is what makes output() usable.
    log_file = open(log_filename, "w")

    # try/finally guarantees that the report is flushed and closed even if a
    # scan or a comparison fails midway.
    try:

        log_file.write("Report generated on %s.\n" % (time.strftime("%a, %d %B %Y %H:%M:%S", time.gmtime()),))

        log_file.write("Arguments specified: %s" % (saved_args,))

        print("Scanning reference tree...")

        if compare_by_content:
            (ref_content_index, ref_name_index) = build_file_index_for(reference_path)
            log_file.write("\n\n ***** For reference tree %s *****\n\n" % (reference_path,))
            display_content_duplicates(reference_path, ref_content_index)
            display_name_duplicates(reference_path, ref_name_index)
            write_hashes(log_file, ref_content_index)
        else:
            ref_name_index = build_name_index_for(reference_path)
            log_file.write("\n\n ***** For reference tree %s *****\n\n" % (reference_path,))
            display_name_duplicates(reference_path, ref_name_index)

        if mirror_path:

            log_file.write("\n\n ***** For mirror tree %s *****\n\n" % (mirror_path,))
            print("Scanning mirror tree...")

            if compare_by_content:
                (mirror_content_index, mirror_name_index) = build_file_index_for(mirror_path)
                display_content_duplicates(mirror_path, mirror_content_index)
                display_name_duplicates(mirror_path, mirror_name_index)
                write_hashes(log_file, mirror_content_index)
            else:
                mirror_name_index = build_name_index_for(mirror_path)
                display_name_duplicates(mirror_path, mirror_name_index)

            # The section header is written before the comparison results it
            # introduces (it used to be appended after them).
            log_file.write("\n\n ***** Tree comparison *****\n\n")

            if compare_by_content:
                compare_content_trees(ref_content_index,
                                      mirror_content_index)
                check_content_completeness(ref_content_index,
                                           mirror_content_index)
            else:
                check_name_completeness(ref_name_index,
                                        mirror_name_index)

    finally:
        log_file.close()
00327