Spaces:

Guy24
/

inner_lexicon

Running

App Files Files Community

inner_lexicon / file_utils.py

Guy24

adding application

d844e87 7 months ago

raw

history blame contribute delete

3.56 kB

	import os
	import re
	import pandas as pd


	def save_df_to_dir(results_df, base_dir, sub_dirs, file_name_format, add_context, model_name):
	# Get the root directory of the project
	root_dir = os.path.dirname(os.path.abspath(__file__))

	# Construct the output directory path
	output_dir = os.path.join(root_dir, base_dir, *sub_dirs)
	os.makedirs(output_dir, exist_ok=True)

	# Construct the file name
	file_name = file_name_format.format(model_name=model_name,
	context="with_context" if add_context else "without_context")

	# Construct the full file path
	file_path = os.path.join(output_dir, file_name)

	# Save the DataFrame to CSV
	results_df.to_csv(file_path, index=False)


	def merge_dfs(base_dir, exp_name, part_format="part_{i}_", output_dir=None,
	filename="patchscopes_results.parquet", output_filename="patchscopes_results.parquet"):
	"""
	Merges DataFrames from directories matching the part format into a single DataFrame,
	and optionally saves the result to a file.

	Args:
	base_dir (str): The base directory containing the data.
	exp_name (str): The experiment name to look for within part directories.
	part_format (str): The general format for identifying parts (e.g., "part_{i}_").
	output_dir (str, optional): Directory to save the merged DataFrame. Default is None.
	filename (str): The filename of the Parquet file to read in each part directory.
	output_filename (str): Name of the output file if saving is enabled.

	Returns:
	pd.DataFrame: A single DataFrame containing data from all parts.
	"""
	dataframes = []
	part_regex = part_format.replace("{i}", r"\d+")

	# List all directories in base_dir
	for dir_name in os.listdir(base_dir):
	if os.path.isdir(os.path.join(base_dir, dir_name)) and re.match(part_regex, dir_name) and (dir_name.endswith(exp_name)):
	part_dir = os.path.join(base_dir, dir_name)
	file_path = os.path.join(part_dir, filename)

	if os.path.exists(file_path):
	# Read the DataFrame and add it to the list
	df = pd.read_parquet(file_path)
	dataframes.append(df)

	# Concatenate all DataFrames into a single DataFrame
	merged_df = pd.concat(dataframes, axis=1)

	# Save the result to file if output_dir is given
	if output_dir:
	os.makedirs(output_dir, exist_ok=True)
	output_path = os.path.join(output_dir, output_filename)
	merged_df.to_parquet(output_path, index=False)

	return merged_df, dataframes


	def parse_string_list_from_file(file_path, delimiter=None):
	"""
	Parses a list of strings from a file, handling various list formats.

	Args:
	file_path (str): Path to the file containing the list.

	Returns:
	list: A list of parsed strings.
	"""
	with open(file_path, 'r') as file:
	content = file.read()

	if delimiter is None:
	# Remove newlines and excess whitespace
	content = re.sub(r'\s+', ' ', content.strip())

	# Handle different delimiters and list formats
	# Removes common list notations like commas, brackets, quotes, etc.
	items = re.split(r'[,\[\]\(\)\{\}"\'\s]+', content)
	else:
	if delimiter == "newline": # TODO fix this
	delimiter = "\n"
	items = [item.strip() for item in content.split(delimiter)]

	# Filter out any empty strings from the list
	return [item for item in items if item]