-
Notifications
You must be signed in to change notification settings - Fork 0
/
package_statistics.py
136 lines (115 loc) · 3.94 KB
/
package_statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python3
"""
This script prints the Debian packages that have the most files associated with them, based
on the Contents index available at http://ftp.uk.debian.org/debian/dists/stable/main/.
"""
__author__ = "Francisco Carrola"
__version__ = "1.0"
import argparse
import gzip
import os
from typing import Dict, List
import sys
import re
import requests
# Architectures accepted as the command-line "arch" argument; the chosen value is
# substituted into the "Contents-{}.gz" file name.
ARCH_CHOICES = [
    "amd64",
    "arm64",
    "armel",
    "armhf",
    "i386",
    "mips64el",
    "mipsel",
    "ppc64el",
    "s390x",
]
# Base URL that the contents index file name is appended to when downloading.
CONTENT_INDICE_MIRROR = "http://ftp.uk.debian.org/debian/dists/stable/main/"
def parse_arguments():
    """
    Build the command-line parser and parse the process arguments.

    A single optional positional argument, ``arch``, is accepted. It must be one of
    ``ARCH_CHOICES`` and falls back to ``"amd64"`` when omitted.

    :return: the parsed arguments, exposing the ``arch`` attribute
    :rtype: argparse.Namespace
    """
    cli = argparse.ArgumentParser()
    # nargs="?" makes the positional optional so the default can apply.
    arch_options = {
        "nargs": "?",
        "choices": ARCH_CHOICES,
        "default": "amd64",
        "help": "Architecture",
    }
    cli.add_argument("arch", **arch_options)
    namespace = cli.parse_args()
    return namespace
class ContentIndice:
    """
    Class to download, parse and rank the content of a contents index file.

    The file is named ``Contents-<arch>.gz``; it is downloaded from
    ``CONTENT_INDICE_MIRROR`` and stored under the same name relative to the current
    working directory.

    :param arch: architecture
    :type arch: str
    """

    FILE_NAME = "Contents-{}.gz"
    # Seconds passed as the requests timeout so a stalled mirror cannot hang the script.
    REQUEST_TIMEOUT = 30

    def __init__(self, arch: str) -> None:
        # The same name is used for the remote file and for the local copy.
        self.file_name = ContentIndice.FILE_NAME.format(arch)
        self.url = CONTENT_INDICE_MIRROR + self.file_name
        # Maps each package name (exactly as written in the index) to the list of
        # file names associated with it.
        self.parsed_content: Dict[str, List[str]] = {}

    def get_file(self) -> None:
        """
        Method to get the contents index file.

        The response body is streamed to ``self.file_name`` in chunks instead of being
        held in memory. An existing file with that name is overwritten.

        :raises requests.exceptions.RequestException: if the request fails, times out or
            the server answers with an HTTP error status
        :raises OSError: if the local file cannot be written
        """
        with requests.get(
            self.url, stream=True, timeout=ContentIndice.REQUEST_TIMEOUT
        ) as response:
            # Fail before touching the local file, so an HTML error page is never
            # saved under a ".gz" name and later fed to the gzip parser.
            response.raise_for_status()
            # Mode "wb" truncates any previous download; no explicit delete is needed.
            with open(self.file_name, "wb") as out_file:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    out_file.write(chunk)

    def parse_file(self) -> None:
        """
        Method to parse the contents index file and organize its content in a dictionary that
        holds all packages and the filenames associated with each of them.

        Each line is expected to hold a file name followed by a comma-separated package
        list as the last whitespace-separated column. Lines that do not have both columns
        (for example blank lines) are skipped. Results are added to
        ``self.parsed_content``.

        :raises OSError: if the file cannot be opened or is not valid gzip data
        """
        # Decode explicitly instead of relying on the locale; undecodable bytes are
        # replaced so a single odd file name cannot abort the whole run.
        with gzip.open(
            self.file_name, "rt", encoding="utf-8", errors="replace"
        ) as file:
            # Iterate lazily: the index is large and does not need to be held in memory.
            for line in file:
                # Split once from the right: file names may contain spaces, the
                # package list (last column) does not.
                columns = line.strip().rsplit(maxsplit=1)
                if len(columns) != 2:
                    continue
                packages_file, packages = columns
                # A file can belong to several packages; record it under each one.
                for package in packages.split(","):
                    self.parsed_content.setdefault(package, []).append(packages_file)

    def get_top_packages(self, number: int) -> None:
        """
        Method that prints the top packages with the most files associated with them.

        One line per package is printed as ``<rank>. <package> -> <file count>``. When
        fewer than ``number`` packages are known, all of them are printed; nothing is
        printed when ``number`` is zero or negative.

        :param number: number of packages to show in the top list
        :type number: int
        """
        # The sort is stable even with reverse=True, so packages with equal counts keep
        # the order in which they were first seen in the index.
        ranked = sorted(
            self.parsed_content.items(),
            key=lambda item: len(item[1]),
            reverse=True,
        )
        # max(..., 0) guards against a negative slice bound dropping items from the end.
        for position, (package, files) in enumerate(ranked[: max(number, 0)], start=1):
            print(f"{position}. {package} -> {len(files)}")
def main():
    """
    Entry point.

    Parses the command line, downloads and parses the contents index for the selected
    architecture, then prints the ten packages with the most files. If an ``IOError`` or
    ``ValueError`` is raised along the way, its message is written to standard error and
    the process exits with status 1.
    """
    args = parse_arguments()
    try:
        content = ContentIndice(args.arch)
        content.get_file()
        content.parse_file()
        content.get_top_packages(10)
    except (IOError, ValueError) as err:
        # Report on stderr so a failure never mixes with the ranking on stdout.
        print(err, file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()