Skip to content

Commit

Permalink
Merge pull request #8 from fossology/HastagAB/Developement
Browse files Browse the repository at this point in the history
Add Text File Support and Bug Fix
  • Loading branch information
Kaushl2208 committed Jul 26, 2020
2 parents f9d6374 + 8818e80 commit 37ad2c2
Show file tree
Hide file tree
Showing 30 changed files with 474 additions and 154 deletions.
6 changes: 6 additions & 0 deletions extractor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from extractor.main import *

def extract(file):
    """Package-level convenience entry point for extraction.

    Thin wrapper that forwards ``file`` unchanged to ``file_runner``
    (brought into scope by ``from extractor.main import *``) and returns
    whatever that call returns.

    NOTE(review): ``file`` is presumably the path of the source file to
    analyse -- confirm against ``extractor.main.file_runner``.
    """
    return file_runner(file)

__all__ = ['file_runner','extract', 'langIdentifier']
5 changes: 4 additions & 1 deletion extractor/binder.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ def readSingleLine(file, regex, sign):
if line:
if line[0] == sign:
line_of_comments += 1
elif line[0:2] == sign:
line_of_comments += 1

if not line.strip():
blank_lines += 1
Expand Down Expand Up @@ -114,10 +116,11 @@ def readMultiLineDiff(file, startSyntax: str, endSyntax: str):
content = ""
endLine.append(lineNumber)
if copy:
line_of_comments += 1
content = content + line.replace('\n',' ')
if not line.strip():
blank_lines += 1
for idx, i in enumerate(endLine):
line_of_comments = line_of_comments + (endLine[idx]-startLine[idx]) + 1
line_of_comments += len(output)
output = [s.strip(startSyntax) for s in output]
output = [s.strip(endSyntax) for s in output]
Expand Down
2 changes: 1 addition & 1 deletion extractor/languages/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__all__ = ["c", "c_sharp", "cpp", "css", "go", "haskell", "html", "java", "javascript", "kotlin", "matlab", "perl", "php", "python", "r", "ruby", "rust", "scala", "shell", "swift"]
__all__ = ["c", "c_sharp", "cpp", "css", "go", "haskell", "html", "java", "javascript", "kotlin", "matlab", "perl", "php", "python", "r", "ruby", "rust", "scala", "shell", "swift", "text"]
7 changes: 5 additions & 2 deletions extractor/languages/c.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,11 @@ def cExtractor(file):
output['cont_single_line_comment'].append({"start_line": result4[1][idx], "end_line": result4[2][idx], "comment": result4[3][idx]})

if result2:
for idx,i in enumerate(result2[0]):
output['multi_line_comment'].append({"start_line": result2[0][idx], "end_line": result2[1][idx], "comment": result2[2][idx]})
try:
for idx,i in enumerate(result2[0]):
output['multi_line_comment'].append({"start_line": result2[0][idx], "end_line": result2[1][idx], "comment": result2[2][idx]})
except:
pass

return output

Expand Down
7 changes: 5 additions & 2 deletions extractor/languages/css.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,11 @@ def cssExtractor(file):
"multi_line_comment": []
}
if result1:
for idx,i in enumerate(result1[0]):
output['multi_line_comment'].append({"start_line": result1[0][idx], "end_line": result1[1][idx], "comment": result1[2][idx]})
try:
for idx,i in enumerate(result1[0]):
output['multi_line_comment'].append({"start_line": result1[0][idx], "end_line": result1[1][idx], "comment": result1[2][idx]})
except:
pass
return output


Expand Down
24 changes: 11 additions & 13 deletions extractor/languages/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ def htmlExtractor(file):
result = CommentSyntax()
result1 = result.gtExclamationDash(file)
result2 = result.slashStar(file)
result4 = contSingleLines(result1)
file = file.split("/")
output = {
"metadata": [{
Expand All @@ -42,22 +41,21 @@ def htmlExtractor(file):
"multi_line_comment": []
}

if result4:
result1 = result4[0]

if result1:
for i in result1[0]:
output['single_line_comment'].append({"line_number" :i[0],"comment": i[1]})

if result4:
for idx,i in enumerate(result4[1]):
output['cont_single_line_comment'].append({"start_line": result4[1][idx], "end_line": result4[2][idx], "comment": result4[3][idx]})

try:
for idx,i in enumerate(result1[0]):
output['multi_line_comment'].append({"start_line": result1[0][idx], "end_line": result1[1][idx], "comment": result1[2][idx]})
except:
pass

if result2:
for idx,i in enumerate(result2[0]):
output['multi_line_comment'].append({"start_line": result2[0][idx], "end_line": result2[1][idx], "comment": result2[2][idx]})
try:
for idx,i in enumerate(result2[0]):
output['multi_line_comment'].append({"start_line": result2[0][idx], "end_line": result2[1][idx], "comment": result2[2][idx]})
except:
pass


return output


Expand Down
7 changes: 5 additions & 2 deletions extractor/languages/javascript.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,11 @@ def javascriptExtractor(file):
output['cont_single_line_comment'].append({"start_line": result4[1][idx], "end_line": result4[2][idx], "comment": result4[3][idx]})

if result2:
for idx,i in enumerate(result2[0]):
output['multi_line_comment'].append({"start_line": result2[0][idx], "end_line": result2[1][idx], "comment": result2[2][idx]})
try:
for idx,i in enumerate(result2[0]):
output['multi_line_comment'].append({"start_line": result2[0][idx], "end_line": result2[1][idx], "comment": result2[2][idx]})
except:
pass


return output
Expand Down
31 changes: 23 additions & 8 deletions extractor/languages/tests/test_c.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,22 @@
import unittest
import re, os
from languages import c
from binder import readSingleLine,readMultiLineDiff
from binder import readSingleLine,readMultiLineDiff,contSingleLines

class CTest(unittest.TestCase):

def test_output(self):
path = os.path.join(os.getcwd(),"languages/tests/TestFiles/textcomment.c")
regex = r'''(\/\/\s*[\w #\.()@+-_*\d]*)'''
sign = '//'
self.syntax_start = "/*"
self.syntax_end ='*/'
comment_single = c.readSingleLine(path,regex)
comment_single = c.readSingleLine(path,regex,sign)
comment_multiline = c.readMultiLineDiff(path,self.syntax_start,self.syntax_end)

comment_contSinglelines = c.contSingleLines(comment_single)
self.assertTrue(comment_single)
self.assertTrue(comment_multiline)
self.assertTrue(comment_contSinglelines)



Expand All @@ -23,9 +25,11 @@ def test_outputFormat(self):
regex = r'''(\/\/\s*[\w #\.()@+-_*\d]*)'''
self.syntax_start = "/*"
self.syntax_end = "*/"
sign = '//'
expected = c.cExtractor(path)
comment_single = readSingleLine(path,regex)
comment_single = readSingleLine(path,regex,sign)
comment_multiline = readMultiLineDiff(path,self.syntax_start,self.syntax_end)
comment_contSinglelines = contSingleLines(comment_single)
file = path.split("/")
output = {
"metadata": [{
Expand All @@ -37,16 +41,27 @@ def test_outputFormat(self):
"sloc": comment_single[1]-(comment_single[3]+comment_multiline[3]+comment_single[2])
}],
"single_line_comment": [],
"cont_single_line_comment": [],
"multi_line_comment": []
}
}

if comment_contSinglelines:
comment_single = comment_contSinglelines[0]

if comment_single:
for i in comment_single[0]:
output['single_line_comment'].append({"line_number" :i[0],"comment": i[1]})

if comment_multiline:
for idx,i in enumerate(comment_multiline[0]):
output['multi_line_comment'].append({"start_line": comment_multiline[0][idx], "end_line": comment_multiline[1][idx], "comment": comment_multiline[2][idx]})
if comment_contSinglelines:
for idx,i in enumerate(comment_contSinglelines[1]):
output['cont_single_line_comment'].append({"start_line": comment_contSinglelines[1][idx], "end_line": comment_contSinglelines[2][idx], "comment": comment_contSinglelines[3][idx]})

if comment_multiline:
try:
for idx,i in enumerate(comment_multiline[0]):
output['multi_line_comment'].append({"start_line": comment_multiline[0][idx], "end_line": comment_multiline[1][idx], "comment": comment_multiline[2][idx]})
except:
pass
self.assertEqual(output,expected)

def test_Source(self):
Expand Down
22 changes: 18 additions & 4 deletions extractor/languages/tests/test_c_sharp.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import unittest
import re, os
from languages import c_sharp
from binder import readSingleLine,readMultiLineDiff
from binder import readSingleLine,readMultiLineDiff,contSingleLines

class CSharpTest(unittest.TestCase):

Expand All @@ -10,11 +10,14 @@ def test_output(self):
regex = r'''(\/\/\s*[\w #\.()@+-_*\d]*)'''
self.syntax_start = "/*"
self.syntax_end ='*/'
comment_single = c_sharp.readSingleLine(path,regex)
sign = '//'
comment_single = c_sharp.readSingleLine(path,regex,sign)
comment_multiline = c_sharp.readMultiLineDiff(path,self.syntax_start,self.syntax_end)
comment_contSingleline = c_sharp.contSingleLines(comment_single)

self.assertTrue(comment_single)
self.assertTrue(comment_multiline)
self.assertTrue(comment_contSingleline)



Expand All @@ -23,9 +26,11 @@ def test_outputFormat(self):
regex = r'''(\/\/\s*[\w #\.()@+-_*\d]*)'''
self.syntax_start = "/*"
self.syntax_end ='*/'
sign = '//'
expected = c_sharp.c_sharpExtractor(path)
comment_single = readSingleLine(path,regex)
comment_single = readSingleLine(path,regex,sign)
comment_multiline = readMultiLineDiff(path,self.syntax_start,self.syntax_end)
comment_contSingleline = contSingleLines(comment_single)
file = path.split("/")
output = {
"metadata": [{
Expand All @@ -37,12 +42,21 @@ def test_outputFormat(self):
"sloc": comment_single[1]-(comment_single[3]+comment_multiline[3]+comment_single[2])
}],
"single_line_comment": [],
"cont_single_line_comment": [],
"multi_line_comment": []
}
}

if comment_contSingleline:
comment_single = comment_contSingleline[0]

if comment_single:
for i in comment_single[0]:
output['single_line_comment'].append({"line_number" :i[0],"comment": i[1]})

if comment_contSingleline:
for idx,i in enumerate(comment_contSingleline[1]):
output['cont_single_line_comment'].append({"start_line": comment_contSingleline[1][idx], "end_line": comment_contSingleline[2][idx], "comment": comment_contSingleline[3][idx]})

if comment_multiline:
for idx,i in enumerate(comment_multiline[0]):
output['multi_line_comment'].append({"start_line": comment_multiline[0][idx], "end_line": comment_multiline[1][idx], "comment": comment_multiline[2][idx]})
Expand Down
30 changes: 23 additions & 7 deletions extractor/languages/tests/test_cpp.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import unittest
import re, os
from languages import cpp
from binder import readSingleLine,readMultiLineDiff
from binder import readSingleLine,readMultiLineDiff,contSingleLines

class CPPTest(unittest.TestCase):

Expand All @@ -10,11 +10,13 @@ def test_output(self):
regex = r'''(\/\/\s*[\w #\.()@+-_*\d]*)'''
self.syntax_start = "/*"
self.syntax_end ='*/'
comment_single = cpp.readSingleLine(path,regex)
sign = '//'
comment_single = cpp.readSingleLine(path,regex,sign)
comment_multiline = cpp.readMultiLineDiff(path,self.syntax_start,self.syntax_end)

comment_contSingleline = cpp.contSingleLines(comment_single)
self.assertTrue(comment_single)
self.assertTrue(comment_multiline)
self.assertTrue(comment_contSingleline)



Expand All @@ -23,9 +25,11 @@ def test_outputFormat(self):
regex = r'''(\/\/\s*[\w #\.()@+-_*\d]*)'''
self.syntax_start = "/*"
self.syntax_end ='*/'
sign = '//'
expected = cpp.cppExtractor(path)
comment_single = readSingleLine(path,regex)
comment_single = readSingleLine(path,regex,sign)
comment_multiline = readMultiLineDiff(path,self.syntax_start,self.syntax_end)
comment_contSingleline = contSingleLines(comment_single)
file = path.split("/")
output = {
"metadata": [{
Expand All @@ -37,15 +41,27 @@ def test_outputFormat(self):
"sloc": comment_single[1]-(comment_single[3]+comment_multiline[3]+comment_single[2])
}],
"single_line_comment": [],
"cont_single_line_comment": [],
"multi_line_comment": []
}
}

if comment_contSingleline:
comment_single = comment_contSingleline[0]

if comment_single:
for i in comment_single[0]:
output['single_line_comment'].append({"line_number" :i[0],"comment": i[1]})

if comment_contSingleline:
for idx,i in enumerate(comment_contSingleline[1]):
output['cont_single_line_comment'].append({"start_line": comment_contSingleline[1][idx], "end_line": comment_contSingleline[2][idx], "comment": comment_contSingleline[3][idx]})

if comment_multiline:
for idx,i in enumerate(comment_multiline[0]):
output['multi_line_comment'].append({"start_line": comment_multiline[0][idx], "end_line": comment_multiline[1][idx], "comment": comment_multiline[2][idx]})
try:
for idx,i in enumerate(comment_multiline[0]):
output['multi_line_comment'].append({"start_line": comment_multiline[0][idx], "end_line": comment_multiline[1][idx], "comment": comment_multiline[2][idx]})
except:
pass

self.assertEqual(output,expected)

Expand Down
7 changes: 5 additions & 2 deletions extractor/languages/tests/test_css.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,11 @@ def test_outputFormat(self):
"multi_line_comment": []
}
if comment_multiline:
for idx,i in enumerate(comment_multiline[0]):
output['multi_line_comment'].append({"start_line": comment_multiline[0][idx], "end_line": comment_multiline[1][idx], "comment": comment_multiline[2][idx]})
try:
for idx,i in enumerate(comment_multiline[0]):
output['multi_line_comment'].append({"start_line": comment_multiline[0][idx], "end_line": comment_multiline[1][idx], "comment": comment_multiline[2][idx]})
except:
pass

self.assertEqual(output,expected)

Expand Down
21 changes: 17 additions & 4 deletions extractor/languages/tests/test_go.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import unittest
import re, os
from languages import go
from binder import readSingleLine,readMultiLineDiff
from binder import readSingleLine,readMultiLineDiff,contSingleLines

class GoTest(unittest.TestCase):

Expand All @@ -10,11 +10,13 @@ def test_output(self):
regex = r'''(\/\/\s*[\w #\.()@+-_*\d]*)'''
self.syntax_start = "/*"
self.syntax_end ='*/'
comment_single = go.readSingleLine(path,regex)
sign = '//'
comment_single = go.readSingleLine(path,regex,sign)
comment_multiline = go.readMultiLineDiff(path,self.syntax_start,self.syntax_end)

comment_contSingleline = go.contSingleLines(comment_single)
self.assertTrue(comment_single)
self.assertTrue(comment_multiline)
self.assertTrue(comment_contSingleline)



Expand All @@ -23,9 +25,11 @@ def test_outputFormat(self):
regex = r'''(\/\/\s*[\w #\.()@+-_*\d]*)'''
self.syntax_start = "/*"
self.syntax_end ='*/'
sign = '//'
expected = go.goExtractor(path)
comment_single = readSingleLine(path,regex)
comment_single = readSingleLine(path,regex,sign)
comment_multiline = readMultiLineDiff(path,self.syntax_start,self.syntax_end)
comment_contSingleline = contSingleLines(comment_single)
file = path.split("/")
output = {
"metadata": [{
Expand All @@ -37,12 +41,21 @@ def test_outputFormat(self):
"sloc": comment_single[1]-(comment_single[3]+comment_multiline[3]+comment_single[2])
}],
"single_line_comment": [],
"cont_single_line_comment": [],
"multi_line_comment": []
}

if comment_contSingleline:
comment_single = comment_contSingleline[0]

if comment_single:
for i in comment_single[0]:
output['single_line_comment'].append({"line_number" :i[0],"comment": i[1]})

if comment_contSingleline:
for idx,i in enumerate(comment_contSingleline[1]):
output['cont_single_line_comment'].append({"start_line": comment_contSingleline[1][idx], "end_line": comment_contSingleline[2][idx], "comment": comment_contSingleline[3][idx]})

if comment_multiline:
for idx,i in enumerate(comment_multiline[0]):
output['multi_line_comment'].append({"start_line": comment_multiline[0][idx], "end_line": comment_multiline[1][idx], "comment": comment_multiline[2][idx]})
Expand Down
Loading

0 comments on commit 37ad2c2

Please sign in to comment.