
Merge pull request #2806 from chaoss/pr-file-patch
Pr file patch into MAIN
sgoggins authored May 23, 2024
2 parents 5384e0f + fcbb819 commit d8ea7c8
Showing 13 changed files with 117 additions and 155 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -1,4 +1,4 @@
- # Augur NEW Release v0.63.3
+ # Augur NEW Release v0.70.0

Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else!
The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io
@@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o
## NEW RELEASE ALERT!
### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md)

- Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.63.3
+ Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.70.0

- The `main` branch is a stable version of our new architecture, which features:
- Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks.
2 changes: 1 addition & 1 deletion augur/application/cli/backend.py
@@ -172,7 +172,7 @@ def determine_worker_processes(ratio,maximum):
sleep_time += 6

#20% of estimate, Maximum value of 25
- secondary_num_processes = determine_worker_processes(.25, 25)
+ secondary_num_processes = determine_worker_processes(.25, 45)
logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}")
secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary"
process_list.append(subprocess.Popen(secondary_worker.split(" ")))
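The CLI changes above (and the matching one in collection.py below) raise the cap that `determine_worker_processes` applies to the secondary queue from 25 to 45 processes. For orientation only, a helper of that shape typically scales concurrency as a fraction of an overall worker estimate and clamps it; the sketch below is illustrative, and the `os.cpu_count()`-based estimate is an assumption, not Augur's actual sizing logic.

```python
# Illustrative sketch only -- the real determine_worker_processes() is defined in
# augur/application/cli/backend.py; the CPU-based estimate here is an assumption.
import os

def determine_worker_processes(ratio: float, maximum: int) -> int:
    """Size a Celery queue as a fraction of an overall process budget, capped at `maximum`."""
    estimate = (os.cpu_count() or 1) * 5   # hypothetical overall budget
    return max(1, min(int(estimate * ratio), maximum))

# After this change the secondary queue still takes ~25% of the estimate,
# but may now grow to 45 processes instead of being capped at 25:
secondary_num_processes = determine_worker_processes(.25, 45)
```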
2 changes: 1 addition & 1 deletion augur/application/cli/collection.py
@@ -132,7 +132,7 @@ def determine_worker_processes(ratio,maximum):
sleep_time += 6

#20% of estimate, Maximum value of 25
- secondary_num_processes = determine_worker_processes(.25, 25)
+ secondary_num_processes = determine_worker_processes(.25, 45)
logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}")
secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary"
process_list.append(subprocess.Popen(secondary_worker.split(" ")))
2 changes: 1 addition & 1 deletion augur/application/cli/tasks.py
@@ -37,7 +37,7 @@ def start():

scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling"
core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n core:{uuid.uuid4().hex}@%h"
secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=25 -n secondary:{uuid.uuid4().hex}@%h -Q secondary"
secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n secondary:{uuid.uuid4().hex}@%h -Q secondary"

scheduling_worker_process = subprocess.Popen(scheduling_worker.split(" "))
core_worker_process = subprocess.Popen(core_worker.split(" "))
6 changes: 5 additions & 1 deletion augur/tasks/git/dependency_tasks/core.py
@@ -72,7 +72,11 @@ def generate_scorecard(session,repo_id,path):
key_handler = GithubApiKeyHandler(session, session.logger)
os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key()

-    required_output = parse_json_from_subprocess_call(session.logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard)
+    try:
+        required_output = parse_json_from_subprocess_call(session.logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard)
+    except Exception as e:
+        session.logger.error(f"Could not parse required output! Error: {e}")
+        raise e

session.logger.info('adding to database...')
session.logger.debug(f"output: {required_output}")
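The change above wraps the OSSF Scorecard subprocess call so that parsing failures are logged through the session logger before the exception is re-raised. For orientation, here is a hedged sketch of what a run-and-parse helper of this shape generally does; it is not Augur's `parse_json_from_subprocess_call`, whose actual location and behavior may differ.

```python
# Hypothetical stand-in for a parse_json_from_subprocess_call-style helper.
import json
import subprocess

def run_and_parse_json(logger, command, cwd=None):
    """Run an external command, capture stdout, and parse it as JSON, logging failures."""
    result = subprocess.run(command, cwd=cwd, capture_output=True, text=True, check=True)
    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError as exc:
        logger.error(f"Command produced non-JSON output: {exc}")
        raise

# e.g. run_and_parse_json(session.logger, ['./scorecard', command, '--format=json'], cwd=path_to_scorecard)
```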
26 changes: 14 additions & 12 deletions augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py
@@ -396,7 +396,8 @@ def rebuild_unknown_affiliation_and_web_caches(session):
# ("DELETE c.* FROM dm_repo_group_weekly c "
# "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE "
# "p.rg_recache=TRUE")
- session.execute_sql(clear_dm_repo_group_weekly)

+ # session.execute_sql(clear_dm_repo_group_weekly)

clear_dm_repo_group_monthly = s.sql.text("""
DELETE
@@ -410,7 +411,8 @@ def rebuild_unknown_affiliation_and_web_caches(session):
# ("DELETE c.* FROM dm_repo_group_monthly c "
# "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE "
# "p.rg_recache=TRUE")
- session.execute_sql(clear_dm_repo_group_monthly)

+ # session.execute_sql(clear_dm_repo_group_monthly)

clear_dm_repo_group_annual = s.sql.text("""
DELETE
@@ -424,7 +426,7 @@ def rebuild_unknown_affiliation_and_web_caches(session):
# ("DELETE c.* FROM dm_repo_group_annual c "
# "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE "
# "p.rg_recache=TRUE")
- session.execute_sql(clear_dm_repo_group_annual)
+ # session.execute_sql(clear_dm_repo_group_annual)

clear_dm_repo_weekly = s.sql.text("""
DELETE
@@ -441,7 +443,7 @@ def rebuild_unknown_affiliation_and_web_caches(session):
# "JOIN repo r ON c.repo_id = r.repo_id "
# "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE "
# "p.rg_recache=TRUE")
- session.execute_sql(clear_dm_repo_weekly)
+ # session.execute_sql(clear_dm_repo_weekly)

clear_dm_repo_monthly = s.sql.text("""
DELETE
@@ -458,7 +460,7 @@ def rebuild_unknown_affiliation_and_web_caches(session):
# "JOIN repo r ON c.repo_id = r.repo_id "
# "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE "
# "p.rg_recache=TRUE")
- session.execute_sql(clear_dm_repo_monthly)
+ # session.execute_sql(clear_dm_repo_monthly)

clear_dm_repo_annual = s.sql.text("""
DELETE
@@ -475,7 +477,7 @@ def rebuild_unknown_affiliation_and_web_caches(session):
# "JOIN repo r ON c.repo_id = r.repo_id "
# "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE "
# "p.rg_recache=TRUE")
- session.execute_sql(clear_dm_repo_annual)
+ # session.execute_sql(clear_dm_repo_annual)

clear_unknown_cache = s.sql.text("""
DELETE
@@ -573,7 +575,7 @@ def rebuild_unknown_affiliation_and_web_caches(session):
"r.repo_group_id, info.a, info.b, info.c")
).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source)

- session.execute_sql(cache_projects_by_week)
+ # session.execute_sql(cache_projects_by_week)

cache_projects_by_month = s.sql.text(
("INSERT INTO dm_repo_group_monthly (repo_group_id, email, affiliation, month, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source) "
@@ -609,7 +611,7 @@ def rebuild_unknown_affiliation_and_web_caches(session):
"r.repo_group_id, info.a, info.b, info.c"
)).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source)

- session.execute_sql(cache_projects_by_month)
+ # session.execute_sql(cache_projects_by_month)

cache_projects_by_year = s.sql.text((
"INSERT INTO dm_repo_group_annual (repo_group_id, email, affiliation, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source) "
@@ -649,7 +651,7 @@ def rebuild_unknown_affiliation_and_web_caches(session):



- session.execute_sql(cache_projects_by_year)
+ # session.execute_sql(cache_projects_by_year)
# Start caching by repo

session.log_activity('Verbose','Caching repos')
@@ -689,7 +691,7 @@ def rebuild_unknown_affiliation_and_web_caches(session):
"a.repo_id, info.a, info.b, info.c"
)).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source)

- session.execute_sql(cache_repos_by_week)
+ # session.execute_sql(cache_repos_by_week)

cache_repos_by_month = s.sql.text((
"INSERT INTO dm_repo_monthly (repo_id, email, affiliation, month, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source)"
@@ -725,7 +727,7 @@ def rebuild_unknown_affiliation_and_web_caches(session):
"a.repo_id, info.a, info.b, info.c"
)).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source)

- session.execute_sql(cache_repos_by_month)
+ # session.execute_sql(cache_repos_by_month)

cache_repos_by_year = s.sql.text((
"INSERT INTO dm_repo_annual (repo_id, email, affiliation, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source)"
@@ -759,7 +761,7 @@ def rebuild_unknown_affiliation_and_web_caches(session):
"a.repo_id, info.a, info.b, info.c"
)).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source)

- session.execute_sql(cache_repos_by_year)
+ # session.execute_sql(cache_repos_by_year)

# Reset cache flags

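This file's change disables the legacy dm_* cache rebuild by commenting out every `session.execute_sql(...)` call while leaving the statement definitions in place. If these statements are expected to return, one alternative (purely a suggestion, not part of this PR) is to gate them behind a flag so they can be toggled without editing code:

```python
# Sketch of a flag-gated alternative to commenting the calls out.
# REBUILD_DM_CACHES is a hypothetical setting, not an existing Augur config key.
REBUILD_DM_CACHES = False

def maybe_execute(session, statement):
    """Execute a dm_* cache-refresh statement only when the legacy caches are enabled."""
    if REBUILD_DM_CACHES:
        session.execute_sql(statement)

# e.g. maybe_execute(session, clear_dm_repo_group_weekly)
```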
35 changes: 23 additions & 12 deletions augur/tasks/github/pull_requests/tasks.py
@@ -12,6 +12,8 @@
from augur.application.db.util import execute_session_query
from ..messages.tasks import process_github_comment_contributors

+ from typing import Generator, List, Dict


platform_id = 1

@@ -29,20 +31,32 @@ def collect_pull_requests(repo_git: str) -> int:
Repo.repo_git == repo_git).one().repo_id

owner, repo = get_owner_repo(repo_git)
-    pr_data = retrieve_all_pr_data(repo_git, logger, manifest.key_auth)

-    if pr_data:
-        process_pull_requests(pr_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db)
+    total_count = 0
+    all_data = []
+    for page in retrieve_all_pr_data(repo_git, logger, manifest.key_auth):
+        all_data += page

+        if len(all_data) >= 1000:
+            process_pull_requests(all_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db)
+            total_count += len(all_data)
+            all_data.clear()

+    if len(all_data):
+        process_pull_requests(all_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db)
+        total_count += len(all_data)

-    return len(pr_data)
+    if total_count > 0:
+        return total_count
+    else:
+        logger.info(f"{owner}/{repo} has no pull requests")
+        return 0



# TODO: Rename pull_request_reviewers table to pull_request_requested_reviewers
# TODO: Fix column names in pull request labels table
- def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> None:
+ def retrieve_all_pr_data(repo_git: str, logger, key_auth): #-> Generator[List[Dict]]:

owner, repo = get_owner_repo(repo_git)

@@ -52,24 +66,21 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> None:
# returns an iterable of all prs at this url (this essentially means you can treat the prs variable as a list of the prs)
prs = GithubPaginator(url, key_auth, logger)

-    all_data = []
    num_pages = prs.get_num_pages()
    for page_data, page in prs.iter_pages():

        if page_data is None:
-            return all_data
+            return

        if len(page_data) == 0:
            logger.debug(
                f"{owner}/{repo} Prs Page {page} contains no data...returning")
            logger.info(f"{owner}/{repo} Prs Page {page} of {num_pages}")
-            return all_data
+            return

        logger.info(f"{owner}/{repo} Prs Page {page} of {num_pages}")

-        all_data += page_data
-
-    return all_data
+        yield page_data


def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db):
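The refactor above turns `retrieve_all_pr_data` into a generator and has `collect_pull_requests` flush accumulated pages to `process_pull_requests` every 1,000 records instead of holding every PR in memory. A stripped-down sketch of that batch-and-flush pattern follows; the page source and batch size are illustrative, not Augur code.

```python
from typing import Dict, Generator, List

def fetch_pages() -> Generator[List[Dict], None, None]:
    """Stand-in for retrieve_all_pr_data(): yields one list of records per API page."""
    for page in range(3):
        yield [{"page": page, "item": i} for i in range(400)]

def collect(batch_size: int = 1000) -> int:
    """Accumulate pages and flush them in batches, returning the total record count."""
    total = 0
    buffer: List[Dict] = []
    for page_data in fetch_pages():
        buffer += page_data
        if len(buffer) >= batch_size:
            # process_pull_requests(buffer, ...) would run here in the real task
            total += len(buffer)
            buffer.clear()
    if buffer:  # flush the final partial batch
        total += len(buffer)
    return total

print(collect())  # 1200 with the illustrative fetch_pages() above
```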
20 changes: 12 additions & 8 deletions augur/tasks/github/util/gh_graphql_entities.py
@@ -338,17 +338,21 @@ def __iter__(self):
#self.logger.info(f"{params}")
data = self.request_graphql_dict(variables=params)
try:
-    coreData = self.extract_paginate_result(data)

-    #Check to make sure we have data
-    coreData['totalCount']
+    coreData = self.extract_paginate_result(data)
+    if coreData is not None:
+        if coreData.get('totalCount') is not None:
+            self.logger.info("... core data obtained")
+        else:
+            self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.")
+            coreData['totalCount'] = 0
+    else:
+        self.logger.error("Core data is None, cannot proceed with operations on it, but assigning a value of Zero to ensure continued collection.")
+        yield None
+        return
except KeyError as e:
    self.logger.error("Could not extract paginate result because there was no data returned")
-    self.logger.error(
-        ''.join(traceback.format_exception(None, e, e.__traceback__)))

-    self.logger.info(f"Graphql paramters: {params}")
-    return
+    self.logger.error(''.join(traceback.format_exception(None, e, e.__traceback__)))


if int(coreData['totalCount']) == 0:
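The `__iter__` change above stops assuming that `extract_paginate_result` always returns a dict containing `totalCount`: a `None` result now yields `None` and stops iteration, and a missing count is coerced to zero so collection can continue. A small, generic sketch of that defensive pattern is below; the names are illustrative, not the GraphQL entity API.

```python
import logging

logger = logging.getLogger(__name__)

def normalize_core_data(core_data):
    """Return paginate metadata with a guaranteed integer totalCount, or None if unusable."""
    if core_data is None:
        logger.error("Core data is None; nothing to iterate for this page")
        return None
    if core_data.get("totalCount") is None:
        logger.info("No numerical totalCount returned; assigning zero")
        core_data["totalCount"] = 0
    return core_data

# e.g. normalize_core_data({"edges": []}) -> {'edges': [], 'totalCount': 0}
```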