diff options
Diffstat (limited to 'utils')
-rwxr-xr-x | utils/common/gitscraper.py | 183 |
1 files changed, 183 insertions, 0 deletions
diff --git a/utils/common/gitscraper.py b/utils/common/gitscraper.py new file mode 100755 index 0000000000..49ef42de13 --- /dev/null +++ b/utils/common/gitscraper.py | |||
@@ -0,0 +1,183 @@ | |||
1 | #!/usr/bin/python | ||
2 | # __________ __ ___. | ||
3 | # Open \______ \ ____ ____ | | _\_ |__ _______ ___ | ||
4 | # Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / | ||
5 | # Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < | ||
6 | # Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ | ||
7 | # \/ \/ \/ \/ \/ | ||
8 | # | ||
9 | # Copyright (c) 2012 Dominik Riebeling | ||
10 | # | ||
11 | # All files in this archive are subject to the GNU General Public License. | ||
12 | # See the file COPYING in the source tree root for full license agreement. | ||
13 | # | ||
14 | # This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY | ||
15 | # KIND, either express or implied. | ||
16 | # | ||
17 | |||
18 | '''Scrape files from a git repository. | ||
19 | |||
20 | This module provides functions to get a subset of files from a git repository. | ||
21 | The files to retrieve can be specified, and the git tree to work on can be | ||
22 | specified. That way arbitrary trees can be retrieved (like a subset of files | ||
23 | for a given tag). | ||
24 | |||
25 | Retrieved files can be packaged into a bzip2 compressed tarball or stored in a | ||
26 | given folder for processing afterwards. | ||
27 | |||
28 | Calls git commands directly for maximum compatibility. | ||
29 | ''' | ||
30 | |||
31 | import re | ||
32 | import subprocess | ||
33 | import os | ||
34 | import tarfile | ||
35 | import tempfile | ||
36 | import shutil | ||
37 | |||
38 | |||
def get_refs(repo):
    '''Get dict matching refs to hashes from repository pointed to by repo.
    @param repo Path to repository root.
    @return Dict mapping each ref name to its hash. Empty if git reported a
            failure or the repository has no refs.
    '''
    print("Getting list of refs")
    # universal_newlines makes communicate() hand back text on Python 3 too.
    proc = subprocess.Popen(["git", "show-ref"], stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, cwd=repo,
                            universal_newlines=True)
    stdout, stderr = proc.communicate()
    refs = {}

    # Judge success by the exit status: git may write warnings to stderr even
    # though the command succeeded.
    if proc.returncode != 0:
        # A non-zero exit without any message is not reported as an error
        # (git show-ref exits that way when there are no refs to list).
        if stderr:
            print("An error occured!\n")
            print(stderr)
        return refs

    for line in stdout.splitlines():
        match = re.match(r'([a-f0-9]+)\s+(\S+)', line)
        if match:
            # ref is the key, hash its value.
            refs[match.group(2)] = match.group(1)

    return refs
62 | |||
63 | |||
def get_lstree(repo, start, filterlist=None):
    '''Get recursive list of tree objects for a given tree.
    @param repo Path to repository root.
    @param start Hash identifying the tree.
    @param filterlist List of path prefixes to retrieve object hashes for.
                      An empty list (or None) will retrieve all paths.
    @return Dict mapping filename to object hash. Empty if git failed.
    '''
    # -z gives NUL terminated records with the path printed verbatim, so
    # names containing whitespace or other unusual characters stay intact.
    proc = subprocess.Popen(["git", "ls-tree", "-r", "-z", start],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=repo)
    stdout, stderr = proc.communicate()
    if not isinstance(stdout, str):
        # Python 3 returns bytes; the rest of this function works on text.
        stdout = stdout.decode("utf-8", "replace")
        stderr = stderr.decode("utf-8", "replace")
    objects = {}

    if proc.returncode != 0:
        print("An error occured!\n")
        print(stderr)
        return objects

    prefixes = tuple(filterlist) if filterlist else ()
    for record in stdout.split('\0'):
        # Each record reads "<mode> <type> <hash>\t<path>".
        meta, tab, path = record.partition('\t')
        fields = meta.split()
        if not tab or len(fields) != 3:
            # trailing empty record or something we do not understand
            continue

        # filter: keep the path if it starts with any requested prefix.
        if prefixes and not path.startswith(prefixes):
            continue

        # If two files have the same content they have the same hash, so
        # the filename has to be used as key.
        if path in objects:
            print("FATAL: key already exists in dict!")
            return {}
        objects[path] = fields[2]
    return objects
99 | |||
100 | |||
def get_object(repo, blob, destfile):
    '''Get an identified object from the repository.
    @param repo Path to repository root.
    @param blob hash for blob to retrieve.
    @param destfile filename for blob output. Missing parent folders are
                    created.
    @return True if file was successfully written, False on error.
    '''
    proc = subprocess.Popen(["git", "cat-file", "-p", blob],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=repo)
    content, errors = proc.communicate()
    if proc.returncode != 0:
        if not isinstance(errors, str):
            # Python 3 returns bytes; decode for a readable message.
            errors = errors.decode("utf-8", "replace")
        print("An error occured!\n")
        print(errors)
        return False

    # make sure output path exists. A bare filename has no directory part
    # and os.makedirs('') would raise, so skip it in that case.
    destdir = os.path.dirname(destfile)
    if destdir and not os.path.exists(destdir):
        os.makedirs(destdir)

    # Write the content untouched: binary mode avoids newline translation
    # and works with the bytes returned by communicate().
    with open(destfile, 'wb') as outfile:
        outfile.write(content)
    return True
123 | |||
124 | |||
def describe_treehash(repo, treehash):
    '''Retrieve output of git-describe for a given hash.
    @param repo Path to repository root.
    @param treehash Hash identifying the tree / commit to describe.
    @return Description string, or an empty string if git-describe failed.
    '''
    # universal_newlines makes communicate() hand back text on Python 3 too.
    proc = subprocess.Popen(["git", "describe", treehash],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=repo,
            universal_newlines=True)
    stdout, stderr = proc.communicate()
    # Use the exit status to detect failure: git may print warnings to
    # stderr while still producing a valid description.
    if proc.returncode != 0:
        print("An error occured!\n")
        print(stderr)
        return ""
    return stdout.rstrip()
139 | |||
140 | |||
def scrape_files(repo, treehash, filelist, dest=""):
    '''Scrape list of files from repository.
    @param repo Path to repository root.
    @param treehash Hash identifying the tree.
    @param filelist List of files to get from repository.
    @param dest Destination path for files. Each file is written below dest
                using its full path from the repository. If dest is an empty
                string a new temporary folder is created and used instead.
    @return Destination path.
    '''
    print("Scraping files from repository")

    target = tempfile.mkdtemp() if dest == "" else dest
    blobs = get_lstree(repo, treehash, filelist)
    # blobs maps repository path -> object hash; fetch every entry.
    for path, blobhash in blobs.items():
        get_object(repo, blobhash, os.path.join(target, path))

    return target
160 | |||
161 | |||
def archive_files(repo, treehash, filelist, basename, tmpfolder=""):
    '''Archive list of files into tarball.
    @param repo Path to repository root.
    @param treehash Hash identifying the tree.
    @param filelist List of files to archive. All files in the archive if left
                    empty.
    @param basename Basename (including path) of output file. Will get used as
                    basename inside of the archive as well (i.e. no tarbomb).
    @param tmpfolder Folder to put intermediate files in. If no folder is given
                     a temporary one will get used.
    @return Output filename.
    '''
    print("Archiving files from repository")

    workfolder = scrape_files(repo, treehash, filelist, tmpfolder)
    outfile = basename + ".tar.bz2"
    try:
        tf = tarfile.open(outfile, "w:bz2")
        try:
            tf.add(workfolder, basename)
        finally:
            # always release the archive handle, even if adding files failed
            tf.close()
    finally:
        # Only remove the work folder if it is not the caller's tmpfolder,
        # and do so even when creating the archive failed.
        if tmpfolder != workfolder:
            shutil.rmtree(workfolder)
    return outfile