Skip to content

Commit

Permalink
Merge pull request #327 from fasrc/cp_sfpathcheck
Browse files Browse the repository at this point in the history
Require path match for allocation pipelines
  • Loading branch information
claire-peters authored Aug 22, 2024
2 parents 0e70d88 + f204250 commit 91972b8
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 72 deletions.
36 changes: 16 additions & 20 deletions coldfront/core/utils/fasrc.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,34 +50,30 @@ def sort_by(list1, sorter, how='attr'):
raise Exception('unclear sorting method')
return is_true, is_false

def select_one_project_allocation(project_obj, resource_obj, dirpath):
    """Return the single allocation for a project/resource/path triple.

    Parameters
    ----------
    project_obj
        Project whose ``allocation_set`` is searched.
    resource_obj
        Resource the allocation must be connected to (matched by id).
    dirpath : str
        Filesystem path; an allocation matches when its ``path`` is a
        substring of ``dirpath`` or vice versa.

    Returns
    -------
    Allocation or None
        The unique matching allocation, or ``None`` when the project has
        no allocations for the resource or none match ``dirpath``.

    Raises
    ------
    Exception
        If more than one allocation matches the project/resource/path
        pairing.
    """
    allocation_query = project_obj.allocation_set.filter(
        resources__id=resource_obj.id)
    # Guard clause: no allocations at all for this resource.
    if allocation_query.count() < 1:
        return None
    # Keep only allocations whose path and dirpath overlap in either
    # direction; both values must be truthy for a match to count.
    matches = [
        a for a in allocation_query
        if a.path and dirpath and (a.path in dirpath or dirpath in a.path)
    ]
    if len(matches) > 1:
        raise Exception('multiple allocations found for project/resource/path pairing')
    # Explicit None when nothing matched the path (previously this was an
    # implicit fall-through return, which was easy to misread).
    return matches[0] if matches else None

def determine_size_fmt(byte_num):
"""Return the correct human-readable byte measurement.
Expand All @@ -86,9 +82,9 @@ def determine_size_fmt(byte_num):
for u in units:
unit = u
if abs(byte_num) < 1024.0:
return round(byte_num, 3), unit
return (round(byte_num, 3), unit)
byte_num /= 1024.0
return(round(byte_num, 3), unit)
return (round(byte_num, 3), unit)

def convert_size_fmt(num, target_unit, source_unit='B'):
units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB']
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def handle(self, *args, **options):
lab_path = entry['fs_path'].replace(f'/n/{entry["server"]}/', '')

resource = Resource.objects.get(name__contains=entry['server'])
alloc_obj = select_one_project_allocation(project, resource, dirpath=lab_path)
alloc_obj = select_one_project_allocation(project, resource, lab_path)
if alloc_obj is not None:
continue
lab_usage_entries = [
Expand Down
85 changes: 36 additions & 49 deletions coldfront/plugins/fasrc/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,13 @@ def produce_query_statement(self, vol_type, volumes=None):

query_dict = {
'quota': {
'volumes': '|'.join(r.name.split('/')[0] for r in Resource.objects.filter(name__contains='tier0')),
'relation': 'HasQuota',
'match': "(e:Quota) MATCH (d:ConfigValue {Name: 'Quota.Invocation'})",
'server': 'filesystem',
'validation_query':
"NOT ((e.SizeGB IS null) OR (e.usedBytes = 0 AND e.SizeGB = 1024)) \
AND (datetime() - duration('P31D') <= datetime(r.DotsLFSUpdateDate)) \
"NOT ((e.SizeGB IS null) OR (e.usedBytes = 0 AND e.SizeGB = 1024))\
AND (datetime() - duration('P31D') <= datetime(r.DotsLFSUpdateDate))\
AND NOT (e.Path IS null)",
'r_updated': 'DotsLFSUpdateDate',
'storage_type': 'Quota',
Expand All @@ -42,17 +43,18 @@ def produce_query_statement(self, vol_type, volumes=None):
'usedbytes': 'usedBytes',
'fs_path': 'Path',
'server_replace': '/n/',
'path_replace': '/n//',
'path_def': "substring(e.Path, size('/n/') + size(split(e.Path, '/')[2]) + 1)",
'unique':'datetime(e.DotsLFSUpdateDate) as begin_date'
},
'isilon': {
'volumes': '|'.join(r.name.split('/')[0] for r in Resource.objects.filter(name__contains='tier1')),
'relation': 'Owns',
'match': "(e:IsilonPath) MATCH (d:ConfigValue {Name: 'IsilonPath.Invocation'})",
'server': 'Isilon',
'validation_query': "r.DotsUpdateDate = d.DotsUpdateDate \
'validation_query': "r.DotsUpdateDate = d.DotsUpdateDate\
AND NOT (e.Path =~ '.*/rc_admin/.*')\
AND (e.Path =~ '.*labs.*')\
AND (datetime() - duration('P31D') <= datetime(r.DotsUpdateDate)) \
AND (datetime() - duration('P31D') <= datetime(r.DotsUpdateDate))\
AND NOT (e.SizeGB = 0)",
'fs_path':'Path',
'r_updated': 'DotsUpdateDate',
Expand All @@ -61,48 +63,43 @@ def produce_query_statement(self, vol_type, volumes=None):
'sizebytes': 'SizeBytes',
'usedbytes': 'UsedBytes',
'server_replace': '01.rc.fas.harvard.edu',
'path_replace': '/ifs/',
'path_def': "replace(e.Path, '/ifs/', '')",
'unique': 'datetime(e.DotsUpdateDate) as begin_date'
},
'volume': {
'volumes': '|'.join(r.name.split('/')[0] for r in Resource.objects.filter(name__contains='tier2')),
'relation': 'Owns',
'match': '(e:Volume)',
'server': 'Hostname',
'validation_query': 'NOT (e.SizeGB = 0)',
'r_updated': 'DotsLVSUpdateDate',
'storage_type': 'Volume',
'fs_path': 'LogicalVolume',
'path_replace': '/dev/data/',
'path_def': "replace(e.LogicalVolume, '/dev/data/', '')",
'usedgb': 'UsedGB',
'sizebytes': 'SizeGB * 1000000000',
'usedbytes': 'UsedGB * 1000000000',
'server_replace': '.rc.fas.harvard.edu',
'unique': 'datetime(e.DotsLVSUpdateDate) as update_date, \
'unique': 'datetime(e.DotsLVSUpdateDate) as update_date,\
datetime(e.DotsLVDisplayUpdateDate) as display_date'
},
}
d = query_dict[vol_type]

if not volumes:
volumes = [
r.name.split('/')[0] for r in Resource.objects.filter(resource_type__name='Storage')
]
volumes = '|'.join(volumes)
where = f"(e.{d['server']} =~ '.*({volumes}).*')"
statement = {
'statement': f"MATCH p=(g:Group)-[r:{d['relation']}]-{d['match']} \
WHERE {where} AND {d['validation_query']}\
'statement': f"MATCH p=(g:Group)-[r:{d['relation']}]-{d['match']}\
WHERE (e.{d['server']} =~ '.*({d['volumes']}).*') AND {d['validation_query']}\
AND NOT (g.ADSamAccountName =~ '.*(disabled|rc_admin).*')\
RETURN \
{d['unique']}, \
RETURN\
{d['unique']},\
g.ADSamAccountName as lab,\
(e.SizeGB / 1024.0) as tb_allocation, \
(e.SizeGB / 1024.0) as tb_allocation,\
e.{d['sizebytes']} as byte_allocation,\
e.{d['usedbytes']} as byte_usage,\
(e.{d['usedgb']} / 1024.0) as tb_usage,\
replace(e.{d['fs_path']}, '{d['path_replace']}', '') as fs_path, \
'{d['storage_type']}' as storage_type, \
datetime(r.{d['r_updated']}) as rel_updated, \
{d['path_def']} as fs_path,\
'{d['storage_type']}' as storage_type,\
datetime(r.{d['r_updated']}) as rel_updated,\
replace(e.{d['server']}, '{d['server_replace']}', '') as server"
}
self.queries['statements'].append(statement)
Expand Down Expand Up @@ -141,14 +138,14 @@ def _standardize_nesefile(self):
headers = headers_df.columns.values.tolist()
data = pd.read_csv(datafile, names=headers, sep='\s+')
data = data.loc[data['pool'].str.contains('1')]
data['fs_path'] = data['pool']
data['lab'] = data['pool'].str.replace('1', '').str.replace('hugl', '').str.replace('hus3', '')
data['server'] = 'nesetape'
data['storage_type'] = 'tape'
data['byte_allocation'] = data['mib_capacity'] * 1048576
data['byte_usage'] = data['mib_used'] * 1048576
data['tb_allocation'] = round(((data['mib_capacity']+ data['mib_capacity']*0.025) / 953674.3164), -1)
data['tb_usage'] = data['mib_used'] / 953674.3164
data['fs_path'] = None
data = data[[
'lab', 'server', 'storage_type', 'byte_allocation',
'byte_usage', 'tb_allocation', 'tb_usage', 'fs_path',
Expand Down Expand Up @@ -187,17 +184,17 @@ def stage_user_member_query(self, groupsearch, pi=False):
return_vars = 'u.ADParentCanonicalName AS path, u.ADDepartment AS department'
query = {'statements': [{
'statement': f"MATCH {match_vars} (g.ADSamAccountName =~ '({groupsearch})')\
RETURN \
u.ADgivenName AS first_name, \
u.ADsurname AS last_name, \
u.ADSamAccountName AS user_name, \
u.ADenabled AS user_enabled, \
RETURN\
u.ADgivenName AS first_name,\
u.ADsurname AS last_name,\
u.ADSamAccountName AS user_name,\
u.ADenabled AS user_enabled,\
g.ADSamAccountName AS group_name,\
{return_vars},\
g.ADManaged_By AS group_manager, \
u.ADgidNumber AS user_gid_number, \
u.ADTitle AS title, \
u.ADCompany AS company, \
g.ADManaged_By AS group_manager,\
u.ADgidNumber AS user_gid_number,\
u.ADTitle AS title,\
u.ADCompany AS company,\
g.ADgidNumber AS group_gid_number"
}]}
resp_json = self.post_query(query)
Expand Down Expand Up @@ -247,37 +244,27 @@ def matched_dict_processing(allocation, data_dicts, paired_allocs, log_message):
def pair_allocations_data(project, quota_dicts):
"""pair allocations with usage dicts"""
logger = logging.getLogger('coldfront.import_quotas')
unpaired_allocs = project.allocation_set.filter(
allocs = project.allocation_set.filter(
status__name__in=['Active','Pending Deactivation'],
resources__resource_type__name='Storage'
)
paired_allocs = {}
# first, pair allocations with those that have same
for allocation in unpaired_allocs:
for allocation in allocs:
dicts = [
d for d in quota_dicts
if d['fs_path'] and allocation.path.lower() == d['fs_path'].lower()
if d['fs_path'] and d['fs_path'].lower() == allocation.path.replace('HDD/', '').replace('SSD-HGST/', '').replace('SSD/', '').lower()
and d['server'] in allocation.resources.first().name
]
if dicts:
log_message = f'Path-based match: {allocation}, {allocation.path}, {dicts[0]}'
paired_allocs = matched_dict_processing(allocation, dicts, paired_allocs, log_message)
unpaired_allocs = [
a for a in unpaired_allocs if a not in paired_allocs
]
unpaired_allocs = [a for a in allocs if a not in paired_allocs]
unpaired_dicts = [d for d in quota_dicts if d not in paired_allocs.values()]
for allocation in unpaired_allocs:
dicts = [
d for d in unpaired_dicts if d['server'] in allocation.resources.first().name
]
if dicts:
log_message = f'Resource-based match: {allocation}, {dicts[0]}'
paired_allocs = matched_dict_processing(allocation, dicts, paired_allocs, log_message)
unpaired_allocs = [
a for a in unpaired_allocs if a not in paired_allocs and a.status.name == 'Active'
]
unpaired_dicts = [d for d in unpaired_dicts if d not in paired_allocs.values()]
if unpaired_dicts or unpaired_allocs:
print(
f"unpaired allocation data. Allocation: {unpaired_allocs} | Dict: {unpaired_dicts}"
)
logger.warning(
"unpaired allocation data. Allocation: %s | Dict: %s", unpaired_allocs, unpaired_dicts
)
Expand Down
3 changes: 1 addition & 2 deletions coldfront/plugins/sftocf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,6 @@ def get_zone_by_name(self, zone_name):
zones = self.get_zones()
return next((z for z in zones if z['name'] == zone_name), None)


def create_zone(self, zone_name, paths, managers, managing_groups):
"""Create a zone via the API"""
url = self.api_url + 'zone/'
Expand Down Expand Up @@ -616,7 +615,7 @@ def allocationquerymatches(self):
total_sort_key = itemgetter('path','volume')
allocation_usage_grouped = return_dict_of_groupings(self.sf_usage_data, total_sort_key)
missing_allocations = [
(k,a) for k, a in allocation_usage_grouped if k not in allocation_list
(k,a) for k, a in allocation_usage_grouped if (a, k) not in allocation_list
]
print("missing_allocations:", missing_allocations)
logger.warning('starfish allocations missing in coldfront: %s', missing_allocations)
Expand Down

0 comments on commit 91972b8

Please sign in to comment.