diff --git a/.travis.yml b/.travis.yml index 6a9fa43b..3e60798b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,7 @@ python: - 3.5 - 3.6 - 3.7 + - 3.8 before_install: - pip install --upgrade pip @@ -23,3 +24,6 @@ script: - atarashi -a tfidf -s CosineSim ./atarashi/atarashii.py - atarashi -a DLD ./atarashi/atarashii.py - atarashi -a wordFrequencySimilarity ./atarashi/atarashii.py + - atarashi -a lr_classifier ./atarashi/atarashii.py + - atarashi -a svc_classifier ./atarashi/atarashii.py + - atarashi -a nb_classifier ./atarashi/atarashii.py diff --git a/MANIFEST.in b/MANIFEST.in index 0957c605..1747d2c3 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -11,6 +11,7 @@ include requirements*.* include pyproject.toml include atarashi/data/licenses/processedLicenses.csv include atarashi/data/Ngram_keywords.json +include atarashi/data/models/* prune .git prune venv diff --git a/README.md b/README.md index ba42075c..51cd9628 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,22 @@ Get the help by running `atarashi -h` or `atarashi --help` - With **Bigram Cosine similarity** `atarashi -a Ngram -s BigramCosineSim /path/to/file.c` +- **Classification models** + - **Training** (optional) + + `python3 atarashi/agents/models/train.py` + - Running **Classification Models** + + + - **Logistic Regression** + + `atarashi -a lr_classifier /path/to/file.c` + - **Multinomial Naive Bayes** + + `atarashi -a nb_classifier /path/to/file.c` + - **Linear SVC** + + `atarashi -a svc_classifier /path/to/file.c` - Running in **verbose** mode `atarashi -a DLD -v /path/to/file.c` @@ -89,6 +105,14 @@ Get the help by running `atarashi -h` or `atarashi --help` understandable by atarashi. 
- `atarashi -a DLD -l /path/to/processedList.csv /path/to/file.c` - `atarashi -a Ngram -l /path/to/processedList.csv -j /path/to/ngram.json /path/to/file.c` +- Running with a custom folder containing all the binary files + - Use the `-m`/`--models` input parameter to provide the location of the model folder containing all the required binary files. + E.g. + + `atarashi -m path/to/custom/model/folder/ -a classifier_name /path/to/file.c` + + *Note: Providing this parameter is optional; if it is omitted, the default folder with binary files is loaded instead.* + ### Running Docker image 1. Pull Docker image diff --git a/atarashi/agents/models/__init__.py b/atarashi/agents/models/__init__.py new file mode 100644 index 00000000..b25aec62 --- /dev/null +++ b/atarashi/agents/models/__init__.py @@ -0,0 +1,4 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from .test import Model as Model diff --git a/atarashi/agents/models/test.py b/atarashi/agents/models/test.py new file mode 100644 index 00000000..b08109a9 --- /dev/null +++ b/atarashi/agents/models/test.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +Copyright 2018 Kaushlendra Pratap (kaushlendrapratap.9837@gmail.com) + +SPDX-License-Identifier: GPL-2.0 + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +version 2 as published by the Free Software Foundation. +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+""" + +import joblib +import os +import argparse +from atarashi.agents.atarashiAgent import AtarashiAgent +from atarashi.libs.initialmatch import spdx_identifer + + +class Model(AtarashiAgent): + + ''' + Class Model Inherits the Atarashi Agent class inorder to follow a linear and similar interface. + Few Methods of parent class are required in Model class. + + :Inherits: Atarashi Agent + :Inherited_Method_1(__init__): Parent class constructor to verify the provided licenseList + :Inherited_Method_2(loadFile): Extracting the license text from the source code and returning a pre-processed comment text. + + :Derived Class: Model + :Method_1(__init__): Initialising absolute path of the models directory + :Method_2(similarity_calc): Classifying the license name from the input processed comment. + :Method_3(model_predict): Returning a list containing respective metadata. + :Method_4(getSimalgo): Getter method + :Method_5(setSimAlgo): Setter method for assigning the algorithm to use. + :Method_6(scan): Acts as a control method which allows to move forward when everything asked for is there. + + ''' + + lr_classifier = "lr_classifier" + nb_classifier = "nb_classifier" + svc_classifier = "svc_classifier" + + def __init__(self, licenseList, modelsLoc): + super().__init__(licenseList) + self.models_folder = os.path.abspath(modelsLoc) + + def similarity_calc(self, processed_comment): + + ''' + The function is designed to give the prediction results of the specific model + asked by the user. Implementation of all three models and their binary files + is done here. + + :param processed_comment: Pre-processed string derived from the input extracted license. + :return: A list containing the predicted license name by the specific model. 
+ :rtype: list() + + ''' + + with open(os.path.join(self.models_folder, 'vectorizer.pkl'), 'rb') as f: + loaded_vect = joblib.load(f) + + if self.getSimAlgo() == self.lr_classifier: + classifier = joblib.load(os.path.join(self.models_folder, 'lr_model.pkl')) + elif self.getSimAlgo() == self.nb_classifier: + classifier = joblib.load(os.path.join(self.models_folder, 'nb_model.pkl')) + elif self.getSimAlgo() == self.svc_classifier: + classifier = joblib.load(os.path.join(self.models_folder, 'svc_model.pkl')) + + return classifier.predict((loaded_vect.transform([processed_comment]))) + + + def model_predict(self, filePath): + + ''' + The function is designed to give output as the most similar predicted files + provided by the user. Three different model approaches are designed + which can result into different similarities. The comments from files are + extracted and then the prediction is done on the basis of pre-trained + models in data folder. + + :param filePath: Input file path to scan + :return: Result with license shortname, sim_score, sim_type and description + :rtype: list(JSON Format) + ''' + + match = [] + + with open(filePath) as file: + raw_data = file.read() + + # Match SPDX identifiers + spdx_identifiers = spdx_identifer(raw_data, self.licenseList['shortname']) + match.extend(spdx_identifiers) + + processed_comment = super().loadFile(filePath) + license_name = self.similarity_calc(processed_comment) + + match.append({ + 'shortname': str(license_name[0]), + 'sim_score': 1, + 'sim_type': self.getSimAlgo(), + 'description': "Shortname: is the predicted license by the model" + }) + return match + + def getSimAlgo(self): + return self.algo + + def setSimAlgo(self, newAlgo): + if newAlgo in (Model.lr_classifier, Model.nb_classifier, Model.svc_classifier): + self.algo = newAlgo + + def scan(self, filePath): + if self.algo in (Model.lr_classifier, Model.nb_classifier, Model.svc_classifier): + return self.model_predict(filePath) + else: + return -1 + + +if 
__name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("processedLicenseList", help="Specify the processed license list file") + parser.add_argument("modelFolder", help="Specify the location of folder with models") + parser.add_argument("inputFile", help="Specify the input file which needs to be scanned") + parser.add_argument("-m","--modelname",default="lr_classifier",choices=["lr_classifier","nb_classifier","svc_classifier"], help = "Specify the model name") + args = parser.parse_args() + + licenseList = args.processedLicenseList + filename = args.inputFile + model = args.modelname + modelFolder = args.modelFolder + + scanner = Model(licenseList, modelFolder) + scanner.setSimAlgo(model) + scanner.scan(filename) diff --git a/atarashi/agents/models/train.py b/atarashi/agents/models/train.py new file mode 100644 index 00000000..ea7413aa --- /dev/null +++ b/atarashi/agents/models/train.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +Copyright 2018 Kaushlendra Pratap (kaushlendrapratap.9837@gmail.com) + +SPDX-License-Identifier: GPL-2.0 + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +version 2 as published by the Free Software Foundation. +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+""" + +import pandas as pd +import os +import joblib +from atarashi.libs.commentPreprocessor import CommentPreprocessor +from sklearn.svm import LinearSVC +from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer +from sklearn.linear_model import LogisticRegression +from sklearn.naive_bayes import MultinomialNB + + + +def model_train(): + + ''' + This function is a very versatile function which starts from loading the Pandas Dataframe + and applying the pre-defined preprocessing technique. It also generates a vocabulary of words + for each license text. Initialisation of all three models followed by the training of each + model on the provided training dataset. Finally, it stores the binary file into models + folder for quick classification in future. + + ''' + + current_dir = os.path.dirname(os.path.abspath(__file__)) + data_dir = os.path.abspath(os.path.join(current_dir,os.path.join(os.pardir,os.pardir))) + + licensepath = os.path.join(data_dir, "data/licenses/licenseList.csv") + binary1 = os.path.join(data_dir, 'data/models/lr_model.pkl') + binary2 = os.path.join(data_dir, 'data/models/nb_model.pkl') + binary3 = os.path.join(data_dir, 'data/models/svc_model.pkl') + binary4 = os.path.join(data_dir, 'data/models/vectorizer.pkl') + + data = pd.read_csv(licensepath) + data.drop(['parent_shortname', 'report_shortname', 'url', 'notes', 'source', 'risk','fullname'], axis = 1, inplace = True) + data.dropna(inplace=True) + data['text'] = data['text'].astype(str) + data['cleaned'] = data['text'].apply(CommentPreprocessor.preprocess) + + X_train, y_train = data['cleaned'],data['shortname'] + count_vect = CountVectorizer() + X_train_counts = count_vect.fit_transform(X_train) + tfidf_transformer = TfidfTransformer() + X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) + + ##Initialisation of Models and creating + naive_bayes = MultinomialNB() + l_regress = LogisticRegression() + svc_classifier = LinearSVC() + + print("Model training is 
going on") + naive_bayes.fit(X_train_tfidf,y_train) + print("First training completed") + l_regress.fit(X_train_tfidf,y_train) + print("Second training completed") + svc_classifier.fit(X_train_tfidf,y_train) + print("Third training completed") + + print("All the models have been trained perfectly!!") + print("Saving the models into data folder....") + joblib.dump(naive_bayes,binary2) + joblib.dump(l_regress,binary1) + joblib.dump(svc_classifier,binary3) + joblib.dump(count_vect,binary4) + print("Done") + + + +if __name__ == "__main__": + model_train() diff --git a/atarashi/atarashii.py b/atarashi/atarashii.py index ce233ba2..5f7c6ee2 100644 --- a/atarashi/atarashii.py +++ b/atarashi/atarashii.py @@ -27,19 +27,23 @@ from atarashi.agents.dameruLevenDist import DameruLevenDist from atarashi.agents.tfidf import TFIDF from atarashi.agents.wordFrequencySimilarity import WordFrequencySimilarity +from atarashi.agents.models import Model __author__ = "Aman Jain" __email__ = "amanjain5221@gmail.com" __version__ = "0.0.10" -def atarashii_runner(inputFile, processedLicense, agent_name, similarity="CosineSim", ngramJsonLoc=None, verbose=None): +def atarashii_runner(inputFile, processedLicense, agent_name, + similarity="CosineSim", ngramJsonLoc=None, modelsLoc=None, + verbose=None): ''' :param inputFile: Input File for scanning of license :param processedLicense: Processed License List (CSV) path (Default path already provided) :param agent_name: Specify the agent that you want to use for scanning :param similarity: Specify the similarity type to be used for the particular agent :param ngramJsonLoc: Specify N-Gram Json File location + :param modelsLoc: Specify folder location of trained models :param verbose: Specify if verbose mode is on or not (Default is Off/ None) :return: Returns the array of JSON with scan results @@ -56,6 +60,9 @@ def atarashii_runner(inputFile, processedLicense, agent_name, similarity="Cosine scanner = "" if agent_name == "wordFrequencySimilarity": 
scanner = WordFrequencySimilarity(processedLicense) + elif agent_name in ("lr_classifier", "svc_classifier", "nb_classifier"): + scanner = Model(processedLicense, modelsLoc) + scanner.setSimAlgo(agent_name) elif agent_name == "DLD": scanner = DameruLevenDist(processedLicense) elif agent_name == "tfidf": @@ -91,12 +98,13 @@ def main(): ''' defaultProcessed = resource_filename("atarashi", "data/licenses/processedLicenses.csv") defaultJSON = resource_filename("atarashi", "data/Ngram_keywords.json") + defaultModels = os.path.dirname(resource_filename("atarashi", "data/models/vectorizer.pkl")) parser = argparse.ArgumentParser() parser.add_argument("inputFile", help="Specify the input file path to scan") parser.add_argument("-l", "--processedLicenseList", required=False, help="Specify the location of processed license list file") parser.add_argument("-a", "--agent_name", required=True, - choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'], + choices=['wordFrequencySimilarity','lr_classifier','svc_classifier','nb_classifier' ,'DLD', 'tfidf', 'Ngram'], help="Name of the agent that needs to be run") parser.add_argument("-s", "--similarity", required=False, default="CosineSim", choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"], @@ -104,6 +112,9 @@ def main(): " First 2 are for TFIDF and last 3 are for Ngram") parser.add_argument("-j", "--ngram_json", required=False, help="Specify the location of Ngram JSON (for Ngram agent only)") + parser.add_argument("-m", "--models", required=False, + help="Specify the location of models folder (for " + "classifier agents only)", default=defaultModels) parser.add_argument("-v", "--verbose", help="increase output verbosity", action="count", default=0) parser.add_argument('-V', '--version', action='version', version='%(prog)s ' + __version__) @@ -114,13 +125,17 @@ def main(): verbose = args.verbose processedLicense = args.processedLicenseList ngram_json = args.ngram_json + models = args.models if processedLicense is 
None: processedLicense = defaultProcessed if ngram_json is None: ngram_json = defaultJSON + if models is None: + models = defaultModels - result = atarashii_runner(inputFile, processedLicense, agent_name, similarity, ngram_json, verbose) + result = atarashii_runner(inputFile, processedLicense, agent_name, similarity, + ngram_json, models, verbose) if agent_name == "wordFrequencySimilarity": result = [{ "shortname": str(result), diff --git a/atarashi/data/models/lr_model.pkl b/atarashi/data/models/lr_model.pkl new file mode 100644 index 00000000..52374279 Binary files /dev/null and b/atarashi/data/models/lr_model.pkl differ diff --git a/atarashi/data/models/nb_model.pkl b/atarashi/data/models/nb_model.pkl new file mode 100644 index 00000000..d94e5307 Binary files /dev/null and b/atarashi/data/models/nb_model.pkl differ diff --git a/atarashi/data/models/svc_model.pkl b/atarashi/data/models/svc_model.pkl new file mode 100644 index 00000000..d5e89853 Binary files /dev/null and b/atarashi/data/models/svc_model.pkl differ diff --git a/atarashi/data/models/vectorizer.pkl b/atarashi/data/models/vectorizer.pkl new file mode 100644 index 00000000..63955f16 Binary files /dev/null and b/atarashi/data/models/vectorizer.pkl differ diff --git a/atarashi/evaluator/evaluator.py b/atarashi/evaluator/evaluator.py index 77f5ab03..a22e8462 100644 --- a/atarashi/evaluator/evaluator.py +++ b/atarashi/evaluator/evaluator.py @@ -51,6 +51,12 @@ def getCommand(agent_name, similarity): command = "atarashi -a wordFrequencySimilarity" elif agent_name == "DLD": command = "atarashi -a DLD" + elif agent_name == "lr_classifier": + command = "atarashi -a lr_classifier" + elif agent_name == "nb_classifier": + command = "atarashi -a nb_classifier" + elif agent_name == "svc_classifier": + command = "atarashi -a svc_classifier" elif agent_name == "tfidf": command = "atarashi -a tfidf" if similarity == "CosineSim": @@ -129,9 +135,9 @@ def evaluate(command): if __name__ == "__main__": parser = 
argparse.ArgumentParser() parser.add_argument("-a", "--agent_name", required=True, - choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'], help="Name of the agent that you want to evaluate") + choices=['wordFrequencySimilarity', 'DLD',"lr_classifier","svc_classifier","nb_classifier", 'tfidf', 'Ngram'], help="Name of the agent that you want to evaluate") parser.add_argument("-s", "--similarity", required=False, - default=" ", choices=["ScoreSim", "CosineSim", "DiceSim", " ", "BigramCosineSim"], help="Specify the similarity algorithm that you want to evaluate" + default=" ", choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"], help="Specify the similarity algorithm that you want to evaluate" " First 2 are for TFIDF and last 3 are for Ngram") args = parser.parse_args() agent_name = args.agent_name diff --git a/atarashi/license/licenseLoader.py b/atarashi/license/licenseLoader.py index 81fb4825..755e959c 100644 --- a/atarashi/license/licenseLoader.py +++ b/atarashi/license/licenseLoader.py @@ -32,7 +32,7 @@ class LicenseLoader(object): def fetch_licenses(licenseList): # common ''' :param licenseList: Path to license list (CSV) - :return: Return the CSV contents as padnas.DataFrame + :return: Return the CSV contents as pandas.DataFrame ''' licenseDataFrame = pd.read_csv(licenseList) licenseDataFrame = licenseDataFrame.replace(np.nan, '', regex = True) diff --git a/setup.py b/setup.py index d41f85fd..5f216197 100755 --- a/setup.py +++ b/setup.py @@ -144,7 +144,11 @@ def run(self): package_data = { 'atarashi': [ 'data/Ngram_keywords.json', - 'data/licenses/processedLicenses.csv' + 'data/licenses/processedLicenses.csv', + 'data/models/lr_model.pkl', + 'data/models/nb_model.pkl', + 'data/models/svc_model.pkl', + 'data/models/vectorizer.pkl' ] }, cmdclass = {