-
Notifications
You must be signed in to change notification settings - Fork 1
/
syllabus_paper_compare.py
143 lines (111 loc) · 5.95 KB
/
syllabus_paper_compare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import os
from collections import Counter
def RemovePunchuations(text):
    """Return *text* with every punctuation character replaced by a space.

    The set of characters treated as punctuation is fixed (note it also
    includes the em dash and the backslash).  A single str.translate call
    does the substitution in one C-level pass instead of one .replace()
    call per character.
    """
    punctuations = '''!()-[]{};:'"\,<>./?@—#$%^&*_~'''
    # maketrans maps each punctuation char to a space (equal-length strings).
    return text.translate(str.maketrans(punctuations, " " * len(punctuations)))
def TokenfromText(text):
    """Split *text* on whitespace and return the lowercased tokens.

    Returns a list of words; an empty/whitespace-only string yields [].
    """
    # Comprehension replaces the original loop-and-append; same result.
    return [word.lower() for word in text.split()]
# Stop words (plus a few stray quote tokens) excluded from frequency counts.
# Hoisted to a module-level frozenset so it is built once, not on every call,
# and membership tests are O(1).
_UNINTERESTING_WORDS = frozenset([
    "the", "a", "to", "if", "is", "it", "of",
    "and", "or", "an", "as", "i", "me", "my",
    "we", "our", "ours", "you", "your", "yours",
    "he", "she", "him", "his", "her", "hers",
    "its", "they", "them", "their", "what",
    "which", "who", "whom", "this", "that",
    "am", "are", "was", "were", "be", "been",
    "being", "have", "has", "had", "do",
    "does", "did", "but", "at", "by", "with",
    "from", "here", "when", "where", "how",
    "all", "any", "both", "each", "few", "more",
    "some", "such", "no", "nor", "too", "very",
    "can", "will", "just", 'a', 'about', 'above',
    'after', 'again', 'against', 'all', 'am',
    'an', 'and', 'any', 'are', "aren't", 'as',
    'at', 'be', 'because', 'been', 'before',
    'didst', 'us', 'one', '"', "'", '"i', '”', "’", '“i',
    'being', 'below', 'between', 'both',
    'but', 'by', "can't", 'cannot', 'could',
    "couldn't", 'did', "didn't", 'do', 'does',
    "doesn't", 'doing', "don't", 'down', 'during',
    'each', 'few', 'for', 'from', 'further', 'had',
    "hadn't", 'has', "hasn't", 'have', "haven't",
    'having',
    'he', "he'd", "he'll", "he's", 'her', 'here', "here's",
    'hers', 'herself', 'him', 'himself', 'his', 'how', "how's",
    'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is',
    "isn't", 'it', "it's", 'its', 'itself', "let's", 'me', 'more',
    'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of',
    'off', 'on', 'once', 'only', 'or', 'other', 'ought', 'our',
    'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't",
    'she', "she'd", "she'll", "she's", 'should', "shouldn't", 'so',
    'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs',
    'them', 'themselves', 'then', 'there', "there's", 'these', 'they',
    "they'd", "they'll", "they're", "they've", 'this', 'those', 'through',
    'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't", 'we',
    "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's",
    'when', "when's", 'where', "where's", 'which', 'while', 'who', "who's",
    'whom', 'why', "why's", 'with', "won't", 'would', "wouldn't", 'you',
    "you'd", "you'll", "you're", "you've", 'your', "thou", "thee", "thy",
    "but", "man", 'yours', 'yourself', 'yourselves',
])


def FrequencyWithUninterestingRemoved(listofwords):
    """Count word occurrences in *listofwords*, dropping uninteresting words.

    Parameters:
        listofwords: iterable of (already lowercased) word tokens.

    Returns:
        dict mapping each interesting word to its occurrence count.
    """
    # Counter does the tallying; the comprehension filters stop words and
    # returns a plain dict, matching the original return type.
    return {word: count
            for word, count in Counter(listofwords).items()
            if word not in _UNINTERESTING_WORDS}
def Frequency_dict(text):
    """Strip punctuation from *text*, tokenize it, and return the
    frequency dict with uninteresting words removed."""
    # Compose the three pipeline stages directly instead of naming each
    # intermediate result.
    return FrequencyWithUninterestingRemoved(TokenfromText(RemovePunchuations(text)))
def Sorted_dict(text):
    """Return the word-frequency dict for *text*, ordered by count.

    Builds the frequency mapping, sorts its (word, count) pairs by count
    in decreasing order, and rebuilds a dict — insertion order preserves
    the ranking.
    """
    pairs = Frequency_dict(text).items()
    ranked = sorted(pairs, key=lambda pair: pair[1], reverse=True)
    return dict(ranked)
def textmaker(path):
    """Read the file at *path* and return its full contents as a string.

    Uses a context manager so the file handle is always closed (the
    original leaked it).
    """
    with open(path) as f:
        return f.read()
def printformat(input_dict):
    """Print each purely-alphabetic word longer than one character,
    left-padded to a 15-char column, followed by its count."""
    for token, count in input_dict.items():
        # Skip single characters and anything containing digits/symbols.
        if token.isalpha() and len(token) > 1:
            print(f"{token:<15s} : {str(count):>1s}")
def dict_compare(dict1, dict2):
    """Print every word present in both dicts with its two counts.

    Output format: a header line, then one line per common word showing
    the count from *dict1* (S = syllabus) and *dict2* (P = paper).
    Iteration order follows *dict1*, matching the original behavior.
    """
    print("WORD : S P ")
    for word, count1 in dict1.items():
        # O(1) membership test replaces the original O(n*m) nested loop.
        if word in dict2:
            print("{:<15s} : {} {} ".format(word, count1, dict2[word]))
# Script entry point: compare the word frequencies of a syllabus file
# against an exam-paper file and print the words common to both.
# Guarded so importing this module no longer triggers file reads/printing.
if __name__ == "__main__":
    # NOTE(review): absolute, user-specific paths — consider reading these
    # from sys.argv or a config file instead.
    syllabus_path = "/Users/saurabhkumarsingh/My Files/Offline Only/Study/UPSC_GDrive/Previous Papers/Paper_Data/Syllabus_GS4.txt"
    ques_path = "/Users/saurabhkumarsingh/My Files/Offline Only/Study/UPSC_GDrive/Previous Papers/Paper_Data/GS4_2021.txt"

    syllabus_text = textmaker(syllabus_path)
    questions_text = textmaker(ques_path)
    syllabus_dict = Sorted_dict(syllabus_text)
    ques_dict = Sorted_dict(questions_text)
    dict_compare(syllabus_dict, ques_dict)