Merge pull request #8 from fossology/HastagAB/Developement

Add Text File Support and Bug Fix
fossology · Jul 26, 2020 · 37ad2c2
2 parents f9d6374 + 8818e80
commit 37ad2c2
Show file tree

Hide file tree

Showing 30 changed files with 474 additions and 154 deletions.
diff --git a/extractor/__init__.py b/extractor/__init__.py
@@ -0,0 +1,6 @@
+from extractor.main import *
+
+def extract(file):
+    return file_runner(file)
+
+__all__ = ['file_runner','extract', 'langIdentifier']
diff --git a/extractor/binder.py b/extractor/binder.py
@@ -43,6 +43,8 @@ def readSingleLine(file, regex, sign):
             if line:
                 if line[0] == sign:
                     line_of_comments += 1
+                elif line[0:2] == sign:
+                    line_of_comments += 1
 
             if not line.strip():
                 blank_lines += 1
@@ -114,10 +116,11 @@ def readMultiLineDiff(file, startSyntax: str, endSyntax: str):
                 content = ""
                 endLine.append(lineNumber)
             if copy:
-                line_of_comments += 1
                 content = content + line.replace('\n',' ')
             if not line.strip():
                 blank_lines += 1
+        for idx, i in enumerate(endLine):
+            line_of_comments = line_of_comments + (endLine[idx]-startLine[idx]) + 1
         line_of_comments += len(output)
         output = [s.strip(startSyntax) for s in output]
         output = [s.strip(endSyntax) for s in output]

diff --git a/extractor/languages/__init__.py b/extractor/languages/__init__.py
@@ -1 +1 @@
-__all__ = ["c", "c_sharp", "cpp", "css", "go", "haskell", "html", "java", "javascript", "kotlin", "matlab", "perl", "php", "python", "r", "ruby", "rust", "scala", "shell", "swift"]
+__all__ = ["c", "c_sharp", "cpp", "css", "go", "haskell", "html", "java", "javascript", "kotlin", "matlab", "perl", "php", "python", "r", "ruby", "rust", "scala", "shell", "swift", "text"]
diff --git a/extractor/languages/c.py b/extractor/languages/c.py
@@ -54,8 +54,11 @@ def cExtractor(file):
             output['cont_single_line_comment'].append({"start_line": result4[1][idx], "end_line": result4[2][idx], "comment": result4[3][idx]})
 
     if result2:
-        for idx,i in enumerate(result2[0]):
-            output['multi_line_comment'].append({"start_line": result2[0][idx], "end_line": result2[1][idx], "comment": result2[2][idx]})
+        try:
+            for idx,i in enumerate(result2[0]):
+                output['multi_line_comment'].append({"start_line": result2[0][idx], "end_line": result2[1][idx], "comment": result2[2][idx]})
+        except:
+            pass
 
     return output
 

diff --git a/extractor/languages/css.py b/extractor/languages/css.py
@@ -40,8 +40,11 @@ def cssExtractor(file):
         "multi_line_comment": []
     }
     if result1:
-        for idx,i in enumerate(result1[0]):
-            output['multi_line_comment'].append({"start_line": result1[0][idx], "end_line": result1[1][idx], "comment": result1[2][idx]})
+        try:
+            for idx,i in enumerate(result1[0]):
+                output['multi_line_comment'].append({"start_line": result1[0][idx], "end_line": result1[1][idx], "comment": result1[2][idx]})
+        except:
+            pass
     return output
 
 

diff --git a/extractor/languages/html.py b/extractor/languages/html.py
@@ -26,7 +26,6 @@ def htmlExtractor(file):
     result = CommentSyntax()
     result1 = result.gtExclamationDash(file)
     result2 = result.slashStar(file)
-    result4 = contSingleLines(result1)
     file = file.split("/")
     output = {
         "metadata": [{
@@ -42,22 +41,21 @@ def htmlExtractor(file):
         "multi_line_comment": []
     }
 
-    if result4:
-        result1 = result4[0]
 
     if result1:
-        for i in result1[0]:
-            output['single_line_comment'].append({"line_number" :i[0],"comment": i[1]})
-
-    if result4:
-        for idx,i in enumerate(result4[1]):
-            output['cont_single_line_comment'].append({"start_line": result4[1][idx], "end_line": result4[2][idx], "comment": result4[3][idx]})
-
+        try:
+            for idx,i in enumerate(result1[0]):
+                output['multi_line_comment'].append({"start_line": result1[0][idx], "end_line": result1[1][idx], "comment": result1[2][idx]})
+        except:
+            pass
+
     if result2:
-        for idx,i in enumerate(result2[0]):
-            output['multi_line_comment'].append({"start_line": result2[0][idx], "end_line": result2[1][idx], "comment": result2[2][idx]})
+        try:
+            for idx,i in enumerate(result2[0]):
+                output['multi_line_comment'].append({"start_line": result2[0][idx], "end_line": result2[1][idx], "comment": result2[2][idx]})
+        except:
+            pass
 
-
     return output
 
 

diff --git a/extractor/languages/javascript.py b/extractor/languages/javascript.py
@@ -54,8 +54,11 @@ def javascriptExtractor(file):
             output['cont_single_line_comment'].append({"start_line": result4[1][idx], "end_line": result4[2][idx], "comment": result4[3][idx]})
 
     if result2:
-        for idx,i in enumerate(result2[0]):
-            output['multi_line_comment'].append({"start_line": result2[0][idx], "end_line": result2[1][idx], "comment": result2[2][idx]})
+        try:
+            for idx,i in enumerate(result2[0]):
+                output['multi_line_comment'].append({"start_line": result2[0][idx], "end_line": result2[1][idx], "comment": result2[2][idx]})
+        except:
+            pass
 
 
     return output

diff --git a/extractor/languages/tests/test_c.py b/extractor/languages/tests/test_c.py
@@ -1,20 +1,22 @@
 import unittest
 import re, os
 from languages import c
-from binder import readSingleLine,readMultiLineDiff
+from binder import readSingleLine,readMultiLineDiff,contSingleLines
 
 class CTest(unittest.TestCase):
 
     def test_output(self):
         path = os.path.join(os.getcwd(),"languages/tests/TestFiles/textcomment.c")
         regex = r'''(\/\/\s*[\w #\.()@+-_*\d]*)'''
+        sign = '//'
         self.syntax_start = "/*"
         self.syntax_end ='*/'
-        comment_single = c.readSingleLine(path,regex)
+        comment_single = c.readSingleLine(path,regex,sign)
         comment_multiline = c.readMultiLineDiff(path,self.syntax_start,self.syntax_end)
-
+        comment_contSinglelines = c.contSingleLines(comment_single)
         self.assertTrue(comment_single)
         self.assertTrue(comment_multiline)
+        self.assertTrue(comment_contSinglelines)
 
 
 
@@ -23,9 +25,11 @@ def test_outputFormat(self):
         regex = r'''(\/\/\s*[\w #\.()@+-_*\d]*)'''
         self.syntax_start = "/*"
         self.syntax_end = "*/"
+        sign = '//'
         expected = c.cExtractor(path)
-        comment_single = readSingleLine(path,regex)
+        comment_single = readSingleLine(path,regex,sign)
         comment_multiline = readMultiLineDiff(path,self.syntax_start,self.syntax_end)
+        comment_contSinglelines = contSingleLines(comment_single)
         file = path.split("/")
         output = {
         "metadata": [{
@@ -37,16 +41,27 @@ def test_outputFormat(self):
         "sloc": comment_single[1]-(comment_single[3]+comment_multiline[3]+comment_single[2])
         }],
         "single_line_comment": [],
+        "cont_single_line_comment": [],
         "multi_line_comment": []
-    }
+        }
+
+        if comment_contSinglelines:
+            comment_single = comment_contSinglelines[0]
+
         if comment_single:
             for i in comment_single[0]:
                 output['single_line_comment'].append({"line_number" :i[0],"comment": i[1]})
 
-        if comment_multiline:
-            for idx,i in enumerate(comment_multiline[0]):
-                output['multi_line_comment'].append({"start_line": comment_multiline[0][idx], "end_line": comment_multiline[1][idx], "comment": comment_multiline[2][idx]})
+        if comment_contSinglelines:
+            for idx,i in enumerate(comment_contSinglelines[1]):
+                output['cont_single_line_comment'].append({"start_line": comment_contSinglelines[1][idx], "end_line": comment_contSinglelines[2][idx], "comment": comment_contSinglelines[3][idx]})
 
+        if comment_multiline:
+            try:
+                for idx,i in enumerate(comment_multiline[0]):
+                    output['multi_line_comment'].append({"start_line": comment_multiline[0][idx], "end_line": comment_multiline[1][idx], "comment": comment_multiline[2][idx]})
+            except:
+                pass
         self.assertEqual(output,expected)  
 
     def test_Source(self):

diff --git a/extractor/languages/tests/test_c_sharp.py b/extractor/languages/tests/test_c_sharp.py
@@ -1,7 +1,7 @@
 import unittest
 import re, os
 from languages import c_sharp
-from binder import readSingleLine,readMultiLineDiff
+from binder import readSingleLine,readMultiLineDiff,contSingleLines
 
 class CSharpTest(unittest.TestCase):
 
@@ -10,11 +10,14 @@ def test_output(self):
         regex = r'''(\/\/\s*[\w #\.()@+-_*\d]*)'''
         self.syntax_start = "/*"
         self.syntax_end ='*/'
-        comment_single = c_sharp.readSingleLine(path,regex)
+        sign = '//'
+        comment_single = c_sharp.readSingleLine(path,regex,sign)
         comment_multiline = c_sharp.readMultiLineDiff(path,self.syntax_start,self.syntax_end)
+        comment_contSingleline = c_sharp.contSingleLines(comment_single)
 
         self.assertTrue(comment_single)
         self.assertTrue(comment_multiline)
+        self.assertTrue(comment_contSingleline)
 
 
 
@@ -23,9 +26,11 @@ def test_outputFormat(self):
         regex = r'''(\/\/\s*[\w #\.()@+-_*\d]*)'''
         self.syntax_start = "/*"
         self.syntax_end ='*/'
+        sign = '//'
         expected = c_sharp.c_sharpExtractor(path)
-        comment_single = readSingleLine(path,regex)
+        comment_single = readSingleLine(path,regex,sign)
         comment_multiline = readMultiLineDiff(path,self.syntax_start,self.syntax_end)
+        comment_contSingleline = contSingleLines(comment_single)
         file = path.split("/")
         output = {
         "metadata": [{
@@ -37,12 +42,21 @@ def test_outputFormat(self):
         "sloc": comment_single[1]-(comment_single[3]+comment_multiline[3]+comment_single[2])
         }],
         "single_line_comment": [],
+        "cont_single_line_comment": [],
         "multi_line_comment": []
-    }
+        }
+
+        if comment_contSingleline:
+            comment_single = comment_contSingleline[0]
+
         if comment_single:
             for i in comment_single[0]:
                 output['single_line_comment'].append({"line_number" :i[0],"comment": i[1]})
 
+        if comment_contSingleline:
+            for idx,i in enumerate(comment_contSingleline[1]):
+                output['cont_single_line_comment'].append({"start_line": comment_contSingleline[1][idx], "end_line": comment_contSingleline[2][idx], "comment": comment_contSingleline[3][idx]})
+
         if comment_multiline:
             for idx,i in enumerate(comment_multiline[0]):
                 output['multi_line_comment'].append({"start_line": comment_multiline[0][idx], "end_line": comment_multiline[1][idx], "comment": comment_multiline[2][idx]})

diff --git a/extractor/languages/tests/test_cpp.py b/extractor/languages/tests/test_cpp.py
@@ -1,7 +1,7 @@
 import unittest
 import re, os
 from languages import cpp
-from binder import readSingleLine,readMultiLineDiff
+from binder import readSingleLine,readMultiLineDiff,contSingleLines
 
 class CPPTest(unittest.TestCase):
 
@@ -10,11 +10,13 @@ def test_output(self):
         regex = r'''(\/\/\s*[\w #\.()@+-_*\d]*)'''
         self.syntax_start = "/*"
         self.syntax_end ='*/'
-        comment_single = cpp.readSingleLine(path,regex)
+        sign = '//'
+        comment_single = cpp.readSingleLine(path,regex,sign)
         comment_multiline = cpp.readMultiLineDiff(path,self.syntax_start,self.syntax_end)
-
+        comment_contSingleline = cpp.contSingleLines(comment_single)
         self.assertTrue(comment_single)
         self.assertTrue(comment_multiline)
+        self.assertTrue(comment_contSingleline)
 
 
 
@@ -23,9 +25,11 @@ def test_outputFormat(self):
         regex = r'''(\/\/\s*[\w #\.()@+-_*\d]*)'''
         self.syntax_start = "/*"
         self.syntax_end ='*/'
+        sign = '//'
         expected = cpp.cppExtractor(path)
-        comment_single = readSingleLine(path,regex)
+        comment_single = readSingleLine(path,regex,sign)
         comment_multiline = readMultiLineDiff(path,self.syntax_start,self.syntax_end)
+        comment_contSingleline = contSingleLines(comment_single)
         file = path.split("/")
         output = {
         "metadata": [{
@@ -37,15 +41,27 @@ def test_outputFormat(self):
         "sloc": comment_single[1]-(comment_single[3]+comment_multiline[3]+comment_single[2])
         }],
         "single_line_comment": [],
+        "cont_single_line_comment": [],
         "multi_line_comment": []
-    }
+        }
+
+        if comment_contSingleline:
+            comment_single = comment_contSingleline[0]
+
         if comment_single:
             for i in comment_single[0]:
                 output['single_line_comment'].append({"line_number" :i[0],"comment": i[1]})
 
+        if comment_contSingleline:
+            for idx,i in enumerate(comment_contSingleline[1]):
+                output['cont_single_line_comment'].append({"start_line": comment_contSingleline[1][idx], "end_line": comment_contSingleline[2][idx], "comment": comment_contSingleline[3][idx]})
+
         if comment_multiline:
-            for idx,i in enumerate(comment_multiline[0]):
-                output['multi_line_comment'].append({"start_line": comment_multiline[0][idx], "end_line": comment_multiline[1][idx], "comment": comment_multiline[2][idx]})
+            try:
+                for idx,i in enumerate(comment_multiline[0]):
+                    output['multi_line_comment'].append({"start_line": comment_multiline[0][idx], "end_line": comment_multiline[1][idx], "comment": comment_multiline[2][idx]})
+            except:
+                pass
 
         self.assertEqual(output,expected)  
 

diff --git a/extractor/languages/tests/test_css.py b/extractor/languages/tests/test_css.py
@@ -35,8 +35,11 @@ def test_outputFormat(self):
         "multi_line_comment": []
         }
         if comment_multiline:
-            for idx,i in enumerate(comment_multiline[0]):
-                output['multi_line_comment'].append({"start_line": comment_multiline[0][idx], "end_line": comment_multiline[1][idx], "comment": comment_multiline[2][idx]})
+            try:
+                for idx,i in enumerate(comment_multiline[0]):
+                    output['multi_line_comment'].append({"start_line": comment_multiline[0][idx], "end_line": comment_multiline[1][idx], "comment": comment_multiline[2][idx]})
+            except:
+                pass
 
         self.assertEqual(output,expected)  
 

diff --git a/extractor/languages/tests/test_go.py b/extractor/languages/tests/test_go.py
@@ -1,7 +1,7 @@
 import unittest
 import re, os
 from languages import go
-from binder import readSingleLine,readMultiLineDiff
+from binder import readSingleLine,readMultiLineDiff,contSingleLines
 
 class GoTest(unittest.TestCase):
 
@@ -10,11 +10,13 @@ def test_output(self):
         regex = r'''(\/\/\s*[\w #\.()@+-_*\d]*)'''
         self.syntax_start = "/*"
         self.syntax_end ='*/'
-        comment_single = go.readSingleLine(path,regex)
+        sign = '//'
+        comment_single = go.readSingleLine(path,regex,sign)
         comment_multiline = go.readMultiLineDiff(path,self.syntax_start,self.syntax_end)
-
+        comment_contSingleline = go.contSingleLines(comment_single)
         self.assertTrue(comment_single)
         self.assertTrue(comment_multiline)
+        self.assertTrue(comment_contSingleline)
 
 
 
@@ -23,9 +25,11 @@ def test_outputFormat(self):
         regex = r'''(\/\/\s*[\w #\.()@+-_*\d]*)'''
         self.syntax_start = "/*"
         self.syntax_end ='*/'
+        sign = '//'
         expected = go.goExtractor(path)
-        comment_single = readSingleLine(path,regex)
+        comment_single = readSingleLine(path,regex,sign)
         comment_multiline = readMultiLineDiff(path,self.syntax_start,self.syntax_end)
+        comment_contSingleline = contSingleLines(comment_single)
         file = path.split("/")
         output = {
         "metadata": [{
@@ -37,12 +41,21 @@ def test_outputFormat(self):
         "sloc": comment_single[1]-(comment_single[3]+comment_multiline[3]+comment_single[2])
         }],
         "single_line_comment": [],
+        "cont_single_line_comment": [],
         "multi_line_comment": []
         }
+
+        if comment_contSingleline:
+            comment_single = comment_contSingleline[0]
+
         if comment_single:
             for i in comment_single[0]:
                 output['single_line_comment'].append({"line_number" :i[0],"comment": i[1]})
 
+        if comment_contSingleline:
+            for idx,i in enumerate(comment_contSingleline[1]):
+                output['cont_single_line_comment'].append({"start_line": comment_contSingleline[1][idx], "end_line": comment_contSingleline[2][idx], "comment": comment_contSingleline[3][idx]})
+
         if comment_multiline:
             for idx,i in enumerate(comment_multiline[0]):
                 output['multi_line_comment'].append({"start_line": comment_multiline[0][idx], "end_line": comment_multiline[1][idx], "comment": comment_multiline[2][idx]})