cpg_flow.targets.dataset

This module defines the `Dataset` class, which is part of the cpg-flow system for managing genomic datasets.

The `Dataset` class allows for:
- Managing sequencing groups: Creation, addition, and retrieval of sequencing groups that are part of the dataset.
- Storage path management: Provides methods to access primary, temporary, analysis, and web storage paths.
- Integration with configurations: Uses configuration settings for workflow management and path handling.
- Pedigree file generation: Capable of generating PED files based on sequencing group data for genetic analysis.
- Logging: Utilizes LOGGER to track and debug sequencing group additions.
Key Components:
- Dataset: Main class that represents a genomic dataset and extends from the `Target` class.
- SequencingGroup Management: Methods to handle sequencing groups related to the dataset.
- Path Handling: Methods to derive and handle various storage paths.
- Configurations: Integration with external configuration settings for flexible dataset handling.
This module is essential for organizing and managing data resources in CPG-related projects.
"""
This module defines the `Dataset` class, which is part of the cpg-flow system for managing genomic datasets.

The `Dataset` class allows for:
- Managing sequencing groups: Creation, addition, and retrieval of sequencing groups that are part of the dataset.
- Storage path management: Provides methods to access primary, temporary, analysis, and web storage paths.
- Integration with configurations: Uses configuration settings for workflow management and path handling.
- Pedigree file generation: Capable of generating PED files based on sequencing group data for genetic analysis.
- Logging: Utilizes LOGGER to track and debug sequencing group additions.

Key Components:
- Dataset: Main class that represents a genomic dataset and extends from the `Target` class.
- SequencingGroup Management: Methods to handle sequencing groups related to the dataset.
- Path Handling: Methods to derive and handle various storage paths.
- Configurations: Integration with external configuration settings for flexible dataset handling.

This module is essential for organizing and managing data resources in CPG-related projects.

"""

from typing import TYPE_CHECKING, Optional

import pandas as pd

from cpg_flow.filetypes import AlignmentInput
from cpg_flow.targets import SequencingGroup, Target, seq_type_subdir
from cpg_flow.utils import get_logger
from cpg_utils import Path, to_path
from cpg_utils.config import dataset_path, get_config, web_url

LOGGER = get_logger(__name__)

if TYPE_CHECKING:
    from cpg_flow.targets import PedigreeInfo, Sex


class Dataset(Target):
    """
    Represents a CPG dataset.

    Each `dataset` at the CPG corresponds to
    * a GCP project: https://github.com/populationgenomics/team-docs/tree/main/storage_policies
    * a Pulumi stack: https://github.com/populationgenomics/analysis-runner/tree/main/stack
    * a metamist project
    """

    def __init__(
        self,
        name: str,
    ):
        super().__init__()
        # Registry of sequencing groups, keyed by sequencing-group ID.
        self._sequencing_group_by_id: dict[str, SequencingGroup] = {}
        self.name = name
        self.active = True

    @staticmethod
    def create(name: str) -> 'Dataset':
        """
        Create a dataset.
        """
        return Dataset(name=name)

    @property
    def target_id(self) -> str:
        """Unique target ID"""
        return self.name

    def __repr__(self):
        return f'Dataset("{self.name}", {len(self.get_sequencing_groups())} sequencing groups)'

    def __str__(self):
        return f'{self.name} ({len(self.get_sequencing_groups())} sequencing groups)'

    def prefix(self, **kwargs) -> Path:
        """
        The primary storage path.
        """
        return to_path(
            dataset_path(
                seq_type_subdir(),
                dataset=self.name,
                **kwargs,
            ),
        )

    def tmp_prefix(self, **kwargs) -> Path:
        """
        Storage path for temporary files.
        """
        return to_path(
            dataset_path(
                seq_type_subdir(),
                dataset=self.name,
                category='tmp',
                **kwargs,
            ),
        )

    def analysis_prefix(self, **kwargs) -> Path:
        """
        Storage path for analysis files.
        """
        return to_path(
            dataset_path(
                seq_type_subdir(),
                dataset=self.name,
                category='analysis',
                **kwargs,
            ),
        )

    def web_prefix(self, **kwargs) -> Path:
        """
        Path for files served by an HTTP server. Matches the corresponding URLs
        returned by self.web_url().
        """
        return to_path(
            dataset_path(
                seq_type_subdir(),
                dataset=self.name,
                category='web',
                **kwargs,
            ),
        )

    def web_url(self) -> str | None:
        """
        URLs matching self.storage_web_path() files served by an HTTP server.
        """
        return web_url(
            seq_type_subdir(),
            dataset=self.name,
        )

    def add_sequencing_group(
        self,
        id: str,  # pylint: disable=redefined-builtin
        *,
        sequencing_type: str,
        sequencing_technology: str,
        sequencing_platform: str,
        external_id: str | None = None,
        participant_id: str | None = None,
        meta: dict | None = None,
        sex: Optional['Sex'] = None,
        pedigree: Optional['PedigreeInfo'] = None,
        alignment_input: AlignmentInput | None = None,
    ) -> 'SequencingGroup':
        """
        Create a new sequencing group and add it to the dataset.

        If a group with the same ID is already registered, the existing object
        is returned unchanged (a debug message is logged).
        """
        if id in self._sequencing_group_by_id:
            LOGGER.debug(
                f'SequencingGroup {id} already exists in the dataset {self.name}',
            )
            return self._sequencing_group_by_id[id]

        # A group is "forced" when any of its identifiers appears in the
        # workflow's `force_sgs` config entry.
        force_sgs = get_config()['workflow'].get('force_sgs', set())
        forced = id in force_sgs or external_id in force_sgs or participant_id in force_sgs

        s = SequencingGroup(
            id=id,
            dataset=self,
            external_id=external_id,
            sequencing_type=sequencing_type,
            sequencing_technology=sequencing_technology,
            sequencing_platform=sequencing_platform,
            participant_id=participant_id,
            meta=meta,
            sex=sex,
            pedigree=pedigree,
            alignment_input=alignment_input,
            forced=forced,
        )
        self._sequencing_group_by_id[id] = s
        return s

    def add_sequencing_group_object(self, s: 'SequencingGroup'):
        """
        Add a sequencing group object to the dataset.

        Args:
            s: SequencingGroup object
        """
        if s.id in self._sequencing_group_by_id:
            LOGGER.debug(
                f'SequencingGroup {s.id} already exists in the dataset {self.name}',
            )
        else:
            self._sequencing_group_by_id[s.id] = s

    def get_sequencing_group_by_id(self, id: str) -> Optional['SequencingGroup']:
        """
        Get sequencing group by ID
        """
        return self._sequencing_group_by_id.get(id)

    def get_sequencing_groups(
        self,
        only_active: bool = True,
    ) -> list['SequencingGroup']:
        """
        Get dataset's sequencing groups. Include only "active" sequencing groups, unless only_active=False
        """
        # Iterate values directly; the previous `.items()` unpacked unused keys.
        return [s for s in self._sequencing_group_by_id.values() if s.active or not only_active]

    def get_job_attrs(self) -> dict:
        """
        Attributes for Hail Batch job.
        """
        return {
            'dataset': self.name,
            # 'sequencing_groups': self.get_sequencing_group_ids(),
        }

    def get_job_prefix(self) -> str:
        """
        Prefix job names.
        """
        return f'{self.name}: '

    def write_ped_file(
        self,
        out_path: Path | None = None,
        use_participant_id: bool = False,
    ) -> Path:
        """
        Create a PED file for all sequencing groups.
        PED is written with no header line to be strict specification compliant.

        Raises:
            ValueError: if no pedigree data is available for this dataset.
        """
        datas = []
        for sequencing_group in self.get_sequencing_groups():
            datas.append(
                sequencing_group.pedigree.get_ped_dict(
                    use_participant_id=use_participant_id,
                ),
            )
        if not datas:
            raise ValueError(f'No pedigree data found for {self.name}')
        df = pd.DataFrame(datas)

        if out_path is None:
            out_path = self.tmp_prefix() / 'ped' / f'{self.get_alignment_inputs_hash()}.ped'

        if not get_config()['workflow'].get('dry_run', False):
            # No header row: strict PED specification compliance.
            with out_path.open('w') as fp:
                df.to_csv(fp, sep='\t', index=False, header=False)
        return out_path
class Dataset(Target):
    """
    Represents a CPG dataset.

    Each `dataset` at the CPG corresponds to
    * a GCP project: https://github.com/populationgenomics/team-docs/tree/main/storage_policies
    * a Pulumi stack: https://github.com/populationgenomics/analysis-runner/tree/main/stack
    * a metamist project
    """

    def __init__(
        self,
        name: str,
    ):
        super().__init__()
        # Sequencing groups registered on this dataset, keyed by ID.
        self._sequencing_group_by_id: dict[str, SequencingGroup] = {}
        self.name = name
        self.active = True

    @staticmethod
    def create(name: str) -> 'Dataset':
        """
        Create a dataset.
        """
        return Dataset(name=name)

    @property
    def target_id(self) -> str:
        """Unique target ID"""
        return self.name

    def __repr__(self):
        return f'Dataset("{self.name}", {len(self.get_sequencing_groups())} sequencing groups)'

    def __str__(self):
        return f'{self.name} ({len(self.get_sequencing_groups())} sequencing groups)'

    def prefix(self, **kwargs) -> Path:
        """
        The primary storage path.
        """
        return to_path(
            dataset_path(
                seq_type_subdir(),
                dataset=self.name,
                **kwargs,
            ),
        )

    def tmp_prefix(self, **kwargs) -> Path:
        """
        Storage path for temporary files.
        """
        return to_path(
            dataset_path(
                seq_type_subdir(),
                dataset=self.name,
                category='tmp',
                **kwargs,
            ),
        )

    def analysis_prefix(self, **kwargs) -> Path:
        """
        Storage path for analysis files.
        """
        return to_path(
            dataset_path(
                seq_type_subdir(),
                dataset=self.name,
                category='analysis',
                **kwargs,
            ),
        )

    def web_prefix(self, **kwargs) -> Path:
        """
        Path for files served by an HTTP server. Matches the corresponding URLs
        returned by self.web_url().
        """
        return to_path(
            dataset_path(
                seq_type_subdir(),
                dataset=self.name,
                category='web',
                **kwargs,
            ),
        )

    def web_url(self) -> str | None:
        """
        URLs matching self.storage_web_path() files served by an HTTP server.
        """
        return web_url(
            seq_type_subdir(),
            dataset=self.name,
        )

    def add_sequencing_group(
        self,
        id: str,  # pylint: disable=redefined-builtin
        *,
        sequencing_type: str,
        sequencing_technology: str,
        sequencing_platform: str,
        external_id: str | None = None,
        participant_id: str | None = None,
        meta: dict | None = None,
        sex: Optional['Sex'] = None,
        pedigree: Optional['PedigreeInfo'] = None,
        alignment_input: AlignmentInput | None = None,
    ) -> 'SequencingGroup':
        """
        Create a new sequencing group and add it to the dataset.

        Returns the already-registered object when the ID is a duplicate.
        """
        if id in self._sequencing_group_by_id:
            LOGGER.debug(
                f'SequencingGroup {id} already exists in the dataset {self.name}',
            )
            return self._sequencing_group_by_id[id]

        # "Forced" if any identifier appears in the workflow's force_sgs config.
        force_sgs = get_config()['workflow'].get('force_sgs', set())
        forced = id in force_sgs or external_id in force_sgs or participant_id in force_sgs

        s = SequencingGroup(
            id=id,
            dataset=self,
            external_id=external_id,
            sequencing_type=sequencing_type,
            sequencing_technology=sequencing_technology,
            sequencing_platform=sequencing_platform,
            participant_id=participant_id,
            meta=meta,
            sex=sex,
            pedigree=pedigree,
            alignment_input=alignment_input,
            forced=forced,
        )
        self._sequencing_group_by_id[id] = s
        return s

    def add_sequencing_group_object(self, s: 'SequencingGroup'):
        """
        Add a sequencing group object to the dataset.

        Args:
            s: SequencingGroup object
        """
        if s.id in self._sequencing_group_by_id:
            LOGGER.debug(
                f'SequencingGroup {s.id} already exists in the dataset {self.name}',
            )
        else:
            self._sequencing_group_by_id[s.id] = s

    def get_sequencing_group_by_id(self, id: str) -> Optional['SequencingGroup']:
        """
        Get sequencing group by ID
        """
        return self._sequencing_group_by_id.get(id)

    def get_sequencing_groups(
        self,
        only_active: bool = True,
    ) -> list['SequencingGroup']:
        """
        Get dataset's sequencing groups. Include only "active" sequencing groups, unless only_active=False
        """
        # Iterate values directly; keys were previously unpacked and unused.
        return [s for s in self._sequencing_group_by_id.values() if s.active or not only_active]

    def get_job_attrs(self) -> dict:
        """
        Attributes for Hail Batch job.
        """
        return {
            'dataset': self.name,
            # 'sequencing_groups': self.get_sequencing_group_ids(),
        }

    def get_job_prefix(self) -> str:
        """
        Prefix job names.
        """
        return f'{self.name}: '

    def write_ped_file(
        self,
        out_path: Path | None = None,
        use_participant_id: bool = False,
    ) -> Path:
        """
        Create a PED file for all sequencing groups.
        PED is written with no header line to be strict specification compliant.

        Raises:
            ValueError: if no pedigree data is available for this dataset.
        """
        datas = []
        for sequencing_group in self.get_sequencing_groups():
            datas.append(
                sequencing_group.pedigree.get_ped_dict(
                    use_participant_id=use_participant_id,
                ),
            )
        if not datas:
            raise ValueError(f'No pedigree data found for {self.name}')
        df = pd.DataFrame(datas)

        if out_path is None:
            out_path = self.tmp_prefix() / 'ped' / f'{self.get_alignment_inputs_hash()}.ped'

        if not get_config()['workflow'].get('dry_run', False):
            # Header suppressed: strict PED has no header line.
            with out_path.open('w') as fp:
                df.to_csv(fp, sep='\t', index=False, header=False)
        return out_path
Represents a CPG dataset.
Each `dataset` at the CPG corresponds to
- a GCP project: https://github.com/populationgenomics/team-docs/tree/main/storage_policies
- a Pulumi stack: https://github.com/populationgenomics/analysis-runner/tree/main/stack
- a metamist project
57 @staticmethod 58 def create(name: str) -> 'Dataset': 59 """ 60 Create a dataset. 61 """ 62 return Dataset(name=name)
Create a dataset.
75 def prefix(self, **kwargs) -> Path: 76 """ 77 The primary storage path. 78 """ 79 return to_path( 80 dataset_path( 81 seq_type_subdir(), 82 dataset=self.name, 83 **kwargs, 84 ), 85 )
The primary storage path.
87 def tmp_prefix(self, **kwargs) -> Path: 88 """ 89 Storage path for temporary files. 90 """ 91 return to_path( 92 dataset_path( 93 seq_type_subdir(), 94 dataset=self.name, 95 category='tmp', 96 **kwargs, 97 ), 98 )
Storage path for temporary files.
100 def analysis_prefix(self, **kwargs) -> Path: 101 """ 102 Storage path for analysis files. 103 """ 104 return to_path( 105 dataset_path( 106 seq_type_subdir(), 107 dataset=self.name, 108 category='analysis', 109 **kwargs, 110 ), 111 )
Storage path for analysis files.
113 def web_prefix(self, **kwargs) -> Path: 114 """ 115 Path for files served by an HTTP server Matches corresponding URLs returns by 116 self.web_url() URLs. 117 """ 118 return to_path( 119 dataset_path( 120 seq_type_subdir(), 121 dataset=self.name, 122 category='web', 123 **kwargs, 124 ), 125 )
Path for files served by an HTTP server. Matches the corresponding URLs returned by self.web_url().
127 def web_url(self) -> str | None: 128 """ 129 URLs matching self.storage_web_path() files serverd by an HTTP server. 130 """ 131 return web_url( 132 seq_type_subdir(), 133 dataset=self.name, 134 )
URLs matching self.storage_web_path() files served by an HTTP server.
136 def add_sequencing_group( 137 self, 138 id: str, # pylint: disable=redefined-builtin 139 *, 140 sequencing_type: str, 141 sequencing_technology: str, 142 sequencing_platform: str, 143 external_id: str | None = None, 144 participant_id: str | None = None, 145 meta: dict | None = None, 146 sex: Optional['Sex'] = None, 147 pedigree: Optional['PedigreeInfo'] = None, 148 alignment_input: AlignmentInput | None = None, 149 ) -> 'SequencingGroup': 150 """ 151 Create a new sequencing group and add it to the dataset. 152 """ 153 if id in self._sequencing_group_by_id: 154 LOGGER.debug( 155 f'SequencingGroup {id} already exists in the dataset {self.name}', 156 ) 157 return self._sequencing_group_by_id[id] 158 159 force_sgs = get_config()['workflow'].get('force_sgs', set()) 160 forced = id in force_sgs or external_id in force_sgs or participant_id in force_sgs 161 162 s = SequencingGroup( 163 id=id, 164 dataset=self, 165 external_id=external_id, 166 sequencing_type=sequencing_type, 167 sequencing_technology=sequencing_technology, 168 sequencing_platform=sequencing_platform, 169 participant_id=participant_id, 170 meta=meta, 171 sex=sex, 172 pedigree=pedigree, 173 alignment_input=alignment_input, 174 forced=forced, 175 ) 176 self._sequencing_group_by_id[id] = s 177 return s
Create a new sequencing group and add it to the dataset.
179 def add_sequencing_group_object(self, s: 'SequencingGroup'): 180 """ 181 Add a sequencing group object to the dataset. 182 Args: 183 s: SequencingGroup object 184 """ 185 if s.id in self._sequencing_group_by_id: 186 LOGGER.debug( 187 f'SequencingGroup {s.id} already exists in the dataset {self.name}', 188 ) 189 else: 190 self._sequencing_group_by_id[s.id] = s
Add a sequencing group object to the dataset. Args: s: SequencingGroup object
192 def get_sequencing_group_by_id(self, id: str) -> Optional['SequencingGroup']: 193 """ 194 Get sequencing group by ID 195 """ 196 return self._sequencing_group_by_id.get(id)
Get sequencing group by ID
198 def get_sequencing_groups( 199 self, 200 only_active: bool = True, 201 ) -> list['SequencingGroup']: 202 """ 203 Get dataset's sequencing groups. Include only "active" sequencing groups, unless only_active=False 204 """ 205 return [s for sid, s in self._sequencing_group_by_id.items() if (s.active or not only_active)]
Get dataset's sequencing groups. Include only "active" sequencing groups, unless only_active=False
207 def get_job_attrs(self) -> dict: 208 """ 209 Attributes for Hail Batch job. 210 """ 211 return { 212 'dataset': self.name, 213 # 'sequencing_groups': self.get_sequencing_group_ids(), 214 }
Attributes for Hail Batch job.
216 def get_job_prefix(self) -> str: 217 """ 218 Prefix job names. 219 """ 220 return f'{self.name}: '
Prefix job names.
222 def write_ped_file( 223 self, 224 out_path: Path | None = None, 225 use_participant_id: bool = False, 226 ) -> Path: 227 """ 228 Create a PED file for all sequencing groups 229 PED is written with no header line to be strict specification compliant 230 """ 231 datas = [] 232 for sequencing_group in self.get_sequencing_groups(): 233 datas.append( 234 sequencing_group.pedigree.get_ped_dict( 235 use_participant_id=use_participant_id, 236 ), 237 ) 238 if not datas: 239 raise ValueError(f'No pedigree data found for {self.name}') 240 df = pd.DataFrame(datas) 241 242 if out_path is None: 243 out_path = self.tmp_prefix() / 'ped' / f'{self.get_alignment_inputs_hash()}.ped' 244 245 if not get_config()['workflow'].get('dry_run', False): 246 with out_path.open('w') as fp: 247 df.to_csv(fp, sep='\t', index=False, header=False) 248 return out_path
Create a PED file for all sequencing groups PED is written with no header line to be strict specification compliant