summaryrefslogtreecommitdiff
path: root/utils/common/gitscraper.py
diff options
context:
space:
mode:
Diffstat (limited to 'utils/common/gitscraper.py')
-rwxr-xr-xutils/common/gitscraper.py183
1 files changed, 183 insertions, 0 deletions
diff --git a/utils/common/gitscraper.py b/utils/common/gitscraper.py
new file mode 100755
index 0000000000..49ef42de13
--- /dev/null
+++ b/utils/common/gitscraper.py
@@ -0,0 +1,183 @@
1#!/usr/bin/python
2# __________ __ ___.
3# Open \______ \ ____ ____ | | _\_ |__ _______ ___
4# Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5# Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6# Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
7# \/ \/ \/ \/ \/
8#
9# Copyright (c) 2012 Dominik Riebeling
10#
11# All files in this archive are subject to the GNU General Public License.
12# See the file COPYING in the source tree root for full license agreement.
13#
14# This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
15# KIND, either express or implied.
16#
17
18'''Scrape files from a git repository.
19
20This module provides functions to get a subset of files from a git repository.
21The files to retrieve can be specified, and the git tree to work on can be
22specified. That way arbitrary trees can be retrieved (like a subset of files
23for a given tag).
24
25Retrieved files can be packaged into a bzip2 compressed tarball or stored in a
26given folder for processing afterwards.
27
28Calls git commands directly for maximum compatibility.
29'''
30
31import re
32import subprocess
33import os
34import tarfile
35import tempfile
36import shutil
37
38
def get_refs(repo):
    '''Get dict matching refs to hashes from repository pointed to by repo.
    Runs "git show-ref" in the repository and parses its output.
    @param repo Path to repository root.
    @return Dict mapping each ref name to its hash. The dict is empty if git
            wrote anything to stderr.
    '''
    print("Getting list of refs")
    # universal_newlines opens the pipes in text mode, so the output is a
    # string (not bytes) regardless of the Python version in use.
    proc = subprocess.Popen(["git", "show-ref"], stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, cwd=repo,
                            universal_newlines=True)
    out, err = proc.communicate()
    refs = {}

    if err:
        print("An error occured!\n")
        print(err)
        return refs

    # Only stdout carries ref data; each line reads "<hash> <refname>".
    for line in out.splitlines():
        match = re.match(r'([a-f0-9]+)\s+(\S+)', line)
        if match:
            # ref is the key, hash its value.
            refs[match.group(2)] = match.group(1)

    return refs
62
63
def get_lstree(repo, start, filterlist=None):
    '''Get recursive list of tree objects for a given tree.
    Runs "git ls-tree -r -z" so that paths are reported verbatim (no quoting)
    and paths containing whitespace are handled correctly.
    @param repo Path to repository root.
    @param start Hash identifying the tree.
    @param filterlist List of path prefixes to retrieve object hashes for.
                      An empty list (or None) will retrieve all paths.
    @return Dict mapping filename to blob hash. The dict is empty if git wrote
            anything to stderr or if a path shows up twice.
    '''
    # universal_newlines opens the pipes in text mode, so the output is a
    # string (not bytes) regardless of the Python version in use.
    proc = subprocess.Popen(["git", "ls-tree", "-r", "-z", start],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                            cwd=repo, universal_newlines=True)
    out, err = proc.communicate()
    objects = {}

    if err:
        print("An error occured!\n")
        print(err)
        return objects

    prefixes = tuple(filterlist) if filterlist else ()
    # With -z every entry is NUL terminated and has the form
    # "<mode> <type> <hash>\t<path>".
    for entry in out.split('\0'):
        meta, tab, path = entry.partition('\t')
        fields = meta.split()
        if not tab or len(fields) != 3:
            # empty trailing chunk or something that is not a tree entry.
            continue
        if prefixes and not path.startswith(prefixes):
            continue
        # If two files have the same content they have the same hash, so
        # the filename has to be used as key.
        if path in objects:
            print("FATAL: key already exists in dict!")
            return {}
        objects[path] = fields[2]
    return objects
99
100
def get_object(repo, blob, destfile):
    '''Get an identified object from the repository.
    Runs "git cat-file -p" for the object and writes its output to destfile,
    creating the folder holding destfile if necessary.
    @param repo Path to repository root.
    @param blob hash for blob to retrieve.
    @param destfile filename for blob output.
    @return True if file was successfully written, False if git wrote
            anything to stderr.
    '''
    proc = subprocess.Popen(["git", "cat-file", "-p", blob],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                            cwd=repo)
    data, err = proc.communicate()
    if err:
        print("An error occured!\n")
        print(err)
        return False
    # make sure output path exists. dirname is empty for a bare filename, in
    # which case there is nothing to create.
    destdir = os.path.dirname(destfile)
    if destdir and not os.path.exists(destdir):
        os.makedirs(destdir)
    # Blobs are arbitrary data: write them unmodified in binary mode and in
    # one go.
    with open(destfile, 'wb') as f:
        f.write(data)
    return True
123
124
def describe_treehash(repo, treehash):
    '''Retrieve output of git-describe for a given hash.
    @param repo Path to repository root.
    @param treehash Hash identifying the tree / commit to describe.
    @return Description string with trailing whitespace removed, or an empty
            string if git wrote anything to stderr.
    '''
    # universal_newlines opens the pipes in text mode, so the description is
    # returned as a string (not bytes) regardless of the Python version.
    proc = subprocess.Popen(["git", "describe", treehash],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                            cwd=repo, universal_newlines=True)
    out, err = proc.communicate()
    if err:
        print("An error occured!\n")
        print(err)
        return ""
    return out.rstrip()
139
140
def scrape_files(repo, treehash, filelist, dest=""):
    '''Scrape list of files from repository.
    @param repo Path to repository root.
    @param treehash Hash identifying the tree.
    @param filelist List of files to get from repository.
    @param dest Destination path for files. Files will get retrieved with full
                path from the repository, and the folder structure will get
                created below dest as necessary. If left empty a new temporary
                folder is created and used.
    @return Destination path.
    '''
    print("Scraping files from repository")

    if not dest:
        dest = tempfile.mkdtemp()
    treeobjects = get_lstree(repo, treehash, filelist)
    for path, blob in treeobjects.items():
        # Keep going on failure, but name the file that is missing from the
        # result instead of dropping it silently.
        if not get_object(repo, blob, os.path.join(dest, path)):
            print("Failed to retrieve %s" % path)

    return dest
160
161
def archive_files(repo, treehash, filelist, basename, tmpfolder=""):
    '''Archive list of files into tarball.
    @param repo Path to repository root.
    @param treehash Hash identifying the tree.
    @param filelist List of files to archive. All files in the archive if left
                    empty.
    @param basename Basename (including path) of output file. Will get used as
                    basename inside of the archive as well (i.e. no tarbomb).
    @param tmpfolder Folder to put intermediate files in. If no folder is given
                     a temporary one will get used.
    @return Output filename.
    '''
    print("Archiving files from repository")

    workfolder = scrape_files(repo, treehash, filelist, tmpfolder)
    outfile = basename + ".tar.bz2"
    try:
        tf = tarfile.open(outfile, "w:bz2")
        try:
            tf.add(workfolder, basename)
        finally:
            # always release the archive handle, even if adding files failed.
            tf.close()
    finally:
        # Only remove the work folder if it is not the folder passed in by the
        # caller. Done in a finally clause so a failed archive run does not
        # leave it behind.
        if tmpfolder != workfolder:
            shutil.rmtree(workfolder)
    return outfile