A class to manage remote datasets through the Focoos API.
This class provides functionality to interact with datasets stored remotely,
including uploading, downloading, and managing dataset data.
Parameters:
Name |
Type |
Description |
Default |
ref
|
str
|
The reference identifier for the dataset.
|
required
|
api_client
|
ApiClient
|
The API client instance for making requests.
|
required
|
Attributes:
Name |
Type |
Description |
ref |
str
|
The dataset reference identifier.
|
api_client |
ApiClient
|
The API client instance for making requests.
|
metadata |
DatasetPreview
|
The dataset metadata.
|
Source code in focoos/remote_dataset.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
class RemoteDataset:
    """
    A class to manage remote datasets through the Focoos API.

    This class provides functionality to interact with datasets stored remotely,
    including uploading, downloading, and managing dataset data.

    Args:
        ref (str): The reference identifier for the dataset.
        api_client (ApiClient): The API client instance for making requests.

    Attributes:
        ref (str): The dataset reference identifier.
        api_client (ApiClient): The API client instance.
        metadata (DatasetPreview): The dataset metadata.
    """

    def __init__(self, ref: str, api_client: ApiClient):
        self.ref = ref
        self.api_client = api_client
        self.metadata: DatasetPreview = self.get_info()

    def get_info(self) -> DatasetPreview:
        """
        Retrieves the dataset information from the API.

        Returns:
            DatasetPreview: The dataset preview information.

        Raises:
            ValueError: If the API request fails.
        """
        res = self.api_client.get(f"datasets/{self.ref}")
        # Fail fast with an explicit error instead of trying to parse an error
        # payload as a DatasetPreview (consistent with download_data/upload_data).
        if res.status_code != 200:
            raise ValueError(f"Failed to get dataset info: {res.status_code} {res.text}")
        return DatasetPreview.from_json(res.json())

    def upload_data(self, path: str) -> Optional[DatasetSpec]:
        """
        Uploads dataset data from a local zip file to the remote storage.

        Args:
            path (str): Local path to the zip file containing dataset data.

        Returns:
            Optional[DatasetSpec]: The dataset specification after successful upload.

        Raises:
            FileNotFoundError: If the specified file does not exist.
            ValueError: If the file is not a zip file or upload fails.
        """
        if not path.endswith(".zip"):
            raise ValueError("Dataset must be .zip compressed")
        if not os.path.exists(path):
            raise FileNotFoundError(f"File not found: {path}")

        file_name = os.path.basename(path)
        file_size = os.path.getsize(path)
        file_size_mb = file_size / (1024 * 1024)

        logger.info(f"🔗 Requesting upload url for {file_name} of size {file_size_mb:.2f} MB")
        presign_res = self.api_client.post(
            f"datasets/{self.ref}/generate-upload-url",
            data={"file_size_bytes": file_size, "file_name": file_name},
        )
        if presign_res.status_code != 200:
            raise ValueError(f"Failed to generate upload url: {presign_res.status_code} {presign_res.text}")
        presigned_url = presign_res.json()

        fields = dict(presigned_url["fields"])
        logger.info(f"📤 Uploading file {file_name}..")
        # Context manager guarantees the file handle is closed even when the
        # upload raises (the original leaked the handle opened via open()).
        with open(path, "rb") as zip_file:
            fields["file"] = (file_name, zip_file, "application/zip")
            res = self.api_client.external_post(
                presigned_url["url"],
                files=fields,
                data=presigned_url["fields"],
                stream=True,
            )
        logger.info("✅ Upload file done.")
        if res.status_code not in [200, 201, 204]:
            raise ValueError(f"Failed to upload dataset: {res.status_code} {res.text}")

        logger.info("🔗 Validating dataset..")
        complete_upload = self.api_client.post(
            f"datasets/{self.ref}/complete-upload",
        )
        if complete_upload.status_code not in [200, 201, 204]:
            raise ValueError(f"Failed to validate dataset: {complete_upload.status_code} {complete_upload.text}")

        self.metadata = self.get_info()
        logger.info(f"✅ Dataset validated! => {self.metadata.spec}")
        return self.metadata.spec

    def download_data(self, path: str):
        """
        Downloads the dataset data to a local path.

        Args:
            path (str): Local path where the dataset should be downloaded.

        Returns:
            str: The path where the file was downloaded.

        Raises:
            ValueError: If the download fails.
        """
        res = self.api_client.get(f"datasets/{self.ref}/download")
        if res.status_code != 200:
            raise ValueError(f"Failed to download dataset data: {res.status_code} {res.text}")
        logger.info(f"📥 Downloading dataset data to {path}")
        url = res.json()["download_uri"]
        path = self.api_client.download_file(url, path)
        logger.info(f"✅ Dataset data downloaded to {path}")
        return path

    def delete(self):
        """
        Deletes the entire dataset from the remote storage.

        Raises:
            Exception: If the deletion fails.
        """
        try:
            res = self.api_client.delete(f"datasets/{self.ref}")
            res.raise_for_status()
            logger.warning(f"Deleted dataset {self.ref}")
        except Exception as e:
            logger.error(f"Failed to delete dataset {self.ref}: {e}")
            raise  # bare raise preserves the original traceback

    def delete_data(self):
        """
        Deletes only the data content of the dataset while preserving metadata.

        Updates the metadata after successful deletion.

        Raises:
            Exception: If the deletion fails.
        """
        try:
            res = self.api_client.delete(f"datasets/{self.ref}/data")
            res.raise_for_status()
            self.metadata = DatasetPreview.from_json(res.json())
            logger.warning(f"Deleted dataset data {self.ref}")
        except Exception as e:
            logger.error(f"Failed to delete dataset data {self.ref}: {e}")
            # Re-raise so callers can react to the failure, consistent with delete().
            raise
|
delete()
Deletes the entire dataset from the remote storage.
Raises:
Type |
Description |
Exception
|
If the deletion fails.
|
Source code in focoos/remote_dataset.py
119
120
121
122
123
124
125
126
127
128
129
130
131
def delete(self):
    """Delete the entire dataset from the remote storage.

    Raises:
        Exception: If the deletion fails.
    """
    try:
        response = self.api_client.delete(f"datasets/{self.ref}")
        response.raise_for_status()
        logger.warning(f"Deleted dataset {self.ref}")
    except Exception as err:
        logger.error(f"Failed to delete dataset {self.ref}: {err}")
        raise err
|
delete_data()
Deletes only the data content of the dataset while preserving metadata.
Updates the metadata after successful deletion.
Source code in focoos/remote_dataset.py
134
135
136
137
138
139
140
141
142
143
144
145
146
147
def delete_data(self):
    """Delete only the data content of the dataset while preserving metadata.

    Updates the metadata after successful deletion.

    Raises:
        Exception: If the deletion fails.
    """
    try:
        res = self.api_client.delete(f"datasets/{self.ref}/data")
        res.raise_for_status()
        new_metadata = DatasetPreview.from_json(res.json())
        self.metadata = new_metadata
        logger.warning(f"Deleted dataset data {self.ref}")
    except Exception as e:
        logger.error(f"Failed to delete dataset data {self.ref}: {e}")
        # Previously the exception was swallowed here; re-raise so callers
        # can detect the failure, matching delete()'s behavior.
        raise
|
download_data(path)
Downloads the dataset data to a local path.
Parameters:
Name |
Type |
Description |
Default |
path
|
str
|
Local path where the dataset should be downloaded.
|
required
|
Returns:
Name | Type |
Description |
str |
|
The path where the file was downloaded.
|
Raises:
Type |
Description |
ValueError
|
If the download fails.
|
Source code in focoos/remote_dataset.py
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
def download_data(self, path: str):
    """Download the dataset data to a local path.

    Args:
        path (str): Local path where the dataset should be downloaded.

    Returns:
        str: The path where the file was downloaded.

    Raises:
        ValueError: If the download fails.
    """
    res = self.api_client.get(f"datasets/{self.ref}/download")
    if res.status_code != 200:
        raise ValueError(f"Failed to download dataset data: {res.status_code} {res.text}")
    logger.info(f"📥 Downloading dataset data to {path}")
    download_uri = res.json()["download_uri"]
    local_path = self.api_client.download_file(download_uri, path)
    logger.info(f"✅ Dataset data downloaded to {local_path}")
    return local_path
|
get_info()
Retrieves the dataset information from the API.
Returns:
Name | Type |
Description |
DatasetPreview |
DatasetPreview
|
The dataset preview information.
|
Source code in focoos/remote_dataset.py
33
34
35
36
37
38
39
40
def get_info(self) -> DatasetPreview:
    """Retrieve the dataset information from the API.

    Returns:
        DatasetPreview: The dataset preview information.

    Raises:
        ValueError: If the API request fails.
    """
    res = self.api_client.get(f"datasets/{self.ref}")
    # Surface HTTP errors explicitly instead of failing while parsing an
    # error payload (consistent with download_data/upload_data).
    if res.status_code != 200:
        raise ValueError(f"Failed to get dataset info: {res.status_code} {res.text}")
    return DatasetPreview.from_json(res.json())
|
upload_data(path)
Uploads dataset data from a local zip file to the remote storage.
Parameters:
Name |
Type |
Description |
Default |
path
|
str
|
Local path to the zip file containing dataset data.
|
required
|
Returns:
Type |
Description |
Optional[DatasetSpec]
|
Optional[DatasetSpec]: The dataset specification after successful upload.
|
Raises:
Type |
Description |
FileNotFoundError
|
If the specified file does not exist.
|
ValueError
|
If the file is not a zip file or upload fails.
|
Source code in focoos/remote_dataset.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
def upload_data(self, path: str) -> Optional[DatasetSpec]:
    """Upload dataset data from a local zip file to the remote storage.

    Args:
        path (str): Local path to the zip file containing dataset data.

    Returns:
        Optional[DatasetSpec]: The dataset specification after successful upload.

    Raises:
        FileNotFoundError: If the specified file does not exist.
        ValueError: If the file is not a zip file or upload fails.
    """
    if not path.endswith(".zip"):
        raise ValueError("Dataset must be .zip compressed")
    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {path}")

    file_name = os.path.basename(path)
    file_size = os.path.getsize(path)
    file_size_mb = file_size / (1024 * 1024)

    logger.info(f"🔗 Requesting upload url for {file_name} of size {file_size_mb:.2f} MB")
    presign_res = self.api_client.post(
        f"datasets/{self.ref}/generate-upload-url",
        data={"file_size_bytes": file_size, "file_name": file_name},
    )
    if presign_res.status_code != 200:
        raise ValueError(f"Failed to generate upload url: {presign_res.status_code} {presign_res.text}")
    presigned_url = presign_res.json()

    fields = dict(presigned_url["fields"])
    logger.info(f"📤 Uploading file {file_name}..")
    # Context manager guarantees the file handle is closed even when the
    # upload raises (the original leaked the handle opened via open()).
    with open(path, "rb") as zip_file:
        fields["file"] = (file_name, zip_file, "application/zip")
        res = self.api_client.external_post(
            presigned_url["url"],
            files=fields,
            data=presigned_url["fields"],
            stream=True,
        )
    logger.info("✅ Upload file done.")
    if res.status_code not in [200, 201, 204]:
        raise ValueError(f"Failed to upload dataset: {res.status_code} {res.text}")

    logger.info("🔗 Validating dataset..")
    complete_upload = self.api_client.post(
        f"datasets/{self.ref}/complete-upload",
    )
    if complete_upload.status_code not in [200, 201, 204]:
        raise ValueError(f"Failed to validate dataset: {complete_upload.status_code} {complete_upload.text}")

    self.metadata = self.get_info()
    logger.info(f"✅ Dataset validated! => {self.metadata.spec}")
    return self.metadata.spec
|