Corey Morris
committed on
Commit
·
c32735e
1
Parent(s):
83a34f0
WIP commit. File finding can be identical to the method in results_data_processor.
Browse files- details_data_processor.py +31 -1
- test_details_data_processing.py +14 -0
details_data_processor.py
CHANGED
|
@@ -10,12 +10,14 @@ class DetailsDataProcessor:
|
|
| 10 |
# Download
|
| 11 |
#url example https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/64bits/LexPodLM-13B/details_harness%7ChendrycksTest-moral_scenarios%7C5_2023-07-25T13%3A41%3A51.227672.json
|
| 12 |
|
| 13 |
-
def __init__(self, directory='results', pattern='
|
| 14 |
self.directory = directory
|
| 15 |
self.pattern = pattern
|
| 16 |
# self.data = self.process_data()
|
| 17 |
# self.ranked_data = self.rank_data()
|
| 18 |
|
|
|
|
|
|
|
| 19 |
# download a file from a single url and save it to a local directory
|
| 20 |
@staticmethod
|
| 21 |
def download_file(url, filename):
|
|
@@ -49,7 +51,35 @@ class DetailsDataProcessor:
|
|
| 49 |
constructed_url = base_url + organization + '/' + model + '/' + other_chunk + filename
|
| 50 |
return constructed_url
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
# @staticmethod
|
| 54 |
# def _find_files(directory, pattern):
|
| 55 |
# for root, dirs, files in os.walk(directory):
|
|
|
|
| 10 |
# Download
|
| 11 |
#url example https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/64bits/LexPodLM-13B/details_harness%7ChendrycksTest-moral_scenarios%7C5_2023-07-25T13%3A41%3A51.227672.json
|
| 12 |
|
| 13 |
+
def __init__(self, directory='results', pattern='moral*.json'):
    """Remember where to look for files and which filenames to accept.

    Args:
        directory: Root directory handed to ``_find_files`` by ``pipeline``.
        pattern: Filename glob handed to ``_find_files`` by ``pipeline``.
    """
    self.directory, self.pattern = directory, pattern
    # self.data = self.process_data()
    # self.ranked_data = self.rank_data()
|
| 18 |
|
| 19 |
+
|
| 20 |
+
|
| 21 |
# download a file from a single url and save it to a local directory
|
| 22 |
@staticmethod
|
| 23 |
def download_file(url, filename):
|
|
|
|
| 51 |
constructed_url = base_url + organization + '/' + model + '/' + other_chunk + filename
|
| 52 |
return constructed_url
|
| 53 |
|
| 54 |
+
# @staticmethod
|
| 55 |
+
# def _find_files(directory, pattern):
|
| 56 |
+
# for root, dirs, files in os.walk(directory):
|
| 57 |
+
# for basename in files:
|
| 58 |
+
# if fnmatch.fnmatch(basename, pattern):
|
| 59 |
+
# filename = os.path.join(root, basename)
|
| 60 |
+
# yield filename
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def _find_files(self, directory, pattern):
    """Recursively collect files under ``directory`` whose name matches ``pattern``.

    Args:
        directory: Root of the tree traversed with ``os.walk``.
        pattern: ``fnmatch``-style glob, tested against each file's basename
            only (never against the directory part of the path).

    Returns:
        A list of ``os.path.join(root, basename)`` paths in traversal order;
        empty when nothing matches.
    """
    hits = []
    for parent, _subdirs, names in os.walk(directory):
        # fnmatch.filter keeps the names that fnmatch.fnmatch would accept,
        # preserving their order.
        hits.extend(
            os.path.join(parent, name)
            for name in fnmatch.filter(names, pattern)
        )
    return hits
|
| 72 |
|
| 73 |
+
|
| 74 |
+
def pipeline(self):
    """Run the single-file pipeline on every file matching the configured pattern.

    For each path returned by ``_find_files(self.directory, self.pattern)``
    this prints the path, builds a URL with ``generate_url`` and passes that
    URL together with the file's basename to ``single_file_pipeline``.

    Returns:
        A list holding one ``single_file_pipeline`` result per matching file,
        in the order the files were found (empty when nothing matches).
    """
    import os  # local import so this method does not depend on file-level imports

    dataframes = []
    for file_path in self._find_files(self.directory, self.pattern):
        print(file_path)
        url = self.generate_url(file_path)
        # _find_files builds paths with os.path.join, so the separator is
        # platform-specific; splitting on '/' would return the whole path on
        # Windows. os.path.basename handles the native separator.
        local_name = os.path.basename(file_path)
        dataframes.append(self.single_file_pipeline(url, local_name))
    return dataframes
|
| 83 |
# @staticmethod
|
| 84 |
# def _find_files(directory, pattern):
|
| 85 |
# for root, dirs, files in os.walk(directory):
|
test_details_data_processing.py
CHANGED
|
@@ -28,6 +28,20 @@ class TestDetailsDataProcessor(unittest.TestCase):
|
|
| 28 |
constructed_url = self.processor.generate_url(results_file_path)
|
| 29 |
self.assertEqual(expected_url, constructed_url)
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
if __name__ == '__main__':
|
| 33 |
unittest.main()
|
|
|
|
| 28 |
constructed_url = self.processor.generate_url(results_file_path)
|
| 29 |
self.assertEqual(expected_url, constructed_url)
|
| 30 |
|
| 31 |
+
def test_pipeline(self):
    """pipeline() returns a list with one DataFrame per processed file."""
    dataframes = self.processor.pipeline()
    # pipeline() accumulates its per-file results in a list and returns that
    # list, so asserting that the return value itself is a DataFrame could
    # never pass.
    self.assertIsInstance(dataframes, list)
    # NOTE(review): assumes single_file_pipeline yields a pandas DataFrame,
    # as the original assertion implied - confirm against its implementation.
    for df in dataframes:
        self.assertIsInstance(df, pd.DataFrame)
|
| 36 |
+
|
| 37 |
+
def test_find_files(self):
    """_find_files returns a list for a directory / glob-pattern pair."""
    directory = 'results'
    pattern = '*moral*.json'
    # The stray breakpoint() was removed: it drops into the debugger and
    # stalls any non-interactive test run.
    files = self.processor._find_files(directory, pattern)
    self.assertIsInstance(files, list)
|
| 44 |
+
|
| 45 |
|
| 46 |
if __name__ == '__main__':
|
| 47 |
unittest.main()
|