cpg_flow.targets.multicohort
This module defines classes for handling multi-cohort and cohort targets in a workflow. It includes functionality for managing datasets, sequencing groups, and generating PED files.
Classes: MultiCohort: Represents a multi-cohort target with multiple cohorts in the workflow. Cohort: Represents a cohort target with all sequencing groups from a single CustomCohort. Dataset: Represents a CPG dataset. Sex: Enum for representing sex as in PED format. SequencingGroup: Represents a sequencing group. PedigreeInfo: Represents pedigree relationships and other PED data.
Functions: seq_type_subdir: Returns a subdirectory parametrized by sequencing type.
1""" 2This module defines classes for handling multi-cohort and cohort targets in a workflow. 3It includes functionality for managing datasets, sequencing groups, and generating PED files. 4 5Classes: 6 MultiCohort: Represents a multi-cohort target with multiple cohorts in the workflow. 7 Cohort: Represents a cohort target with all sequencing groups from a single CustomCohort. 8 Dataset: Represents a CPG dataset. 9 Sex: Enum for representing sex as in PED format. 10 SequencingGroup: Represents a sequencing group. 11 PedigreeInfo: Represents pedigree relationships and other PED data. 12 13Functions: 14 seq_type_subdir: Returns a subdirectory parametrized by sequencing type. 15""" 16 17from typing import TYPE_CHECKING, Optional 18 19import pandas as pd 20 21from cpg_flow.targets import Cohort, Dataset, Target 22from cpg_flow.utils import get_logger 23from cpg_utils import Path 24from cpg_utils.config import get_config 25 26LOGGER = get_logger(__name__) 27 28if TYPE_CHECKING: 29 from cpg_flow.targets import SequencingGroup 30 31 32class MultiCohort(Target): 33 """ 34 Represents a "multi-cohort" target - multiple cohorts in the workflow. 35 """ 36 37 def __init__(self) -> None: 38 super().__init__() 39 40 # NOTE: For a cohort, we simply pull the dataset name from the config. 41 input_cohorts = get_config()['workflow'].get('input_cohorts', []) 42 if input_cohorts: 43 self.name = '_'.join(sorted(input_cohorts)) 44 else: 45 self.name = get_config()['workflow']['dataset'] 46 47 assert self.name, 'Ensure cohorts or dataset is defined in the config file.' 
48 49 self._cohorts_by_id: dict[str, Cohort] = {} 50 self._datasets_by_name: dict[str, Dataset] = {} 51 self.analysis_dataset = Dataset(name=get_config()['workflow']['dataset']) 52 53 def __repr__(self): 54 return f'MultiCohort({len(self.get_cohorts())} cohorts)' 55 56 @property 57 def target_id(self) -> str: 58 """Unique target ID""" 59 return self.name 60 61 def create_dataset(self, name: str) -> 'Dataset': 62 """ 63 Create a dataset and add it to the cohort. 64 """ 65 if name in self._datasets_by_name: 66 return self._datasets_by_name[name] 67 68 if name == self.analysis_dataset.name: 69 ds = self.analysis_dataset 70 else: 71 ds = Dataset(name=name) 72 73 self._datasets_by_name[ds.name] = ds 74 return ds 75 76 def get_cohorts(self, only_active: bool = True) -> list['Cohort']: 77 """ 78 Gets list of all cohorts. 79 Include only "active" cohorts (unless only_active is False) 80 """ 81 cohorts = list(self._cohorts_by_id.values()) 82 if only_active: 83 cohorts = [c for c in cohorts if c.active] 84 return cohorts 85 86 def get_cohort_ids(self, only_active: bool = True) -> list['str']: 87 """ 88 Get list of cohort IDs. 89 Include only "active" cohorts (unless only_active is False) 90 """ 91 return [c.get_cohort_id() for c in self.get_cohorts(only_active)] 92 93 def get_cohort_by_id( 94 self, 95 id: str, 96 only_active: bool = True, 97 ) -> Optional['Cohort']: 98 """ 99 Get cohort by id. 100 Include only "active" cohorts (unless only_active is False) 101 """ 102 cohort = self._cohorts_by_id.get(id) 103 if not cohort: 104 LOGGER.warning(f'Cohort {id} not found in the multi-cohort') 105 106 if not only_active: # Return cohort even if it's inactive 107 return cohort 108 if isinstance(cohort, Cohort) and cohort.active: 109 return cohort 110 return None 111 112 def get_datasets(self, only_active: bool = True) -> list['Dataset']: 113 """ 114 Gets list of all datasets. 
115 Include only "active" datasets (unless only_active is False) 116 """ 117 all_datasets = list(self._datasets_by_name.values()) 118 if only_active: 119 all_datasets = [d for d in all_datasets if d.active and d.get_sequencing_groups()] 120 return all_datasets 121 122 def get_sequencing_groups( 123 self, 124 only_active: bool = True, 125 ) -> list['SequencingGroup']: 126 """ 127 Gets a flat list of all sequencing groups from all datasets. 128 uses a dictionary to avoid duplicates (we could have the same sequencing group in multiple cohorts) 129 Include only "active" sequencing groups (unless only_active is False) 130 """ 131 all_sequencing_groups: dict[str, SequencingGroup] = {} 132 for dataset in self.get_datasets(only_active): 133 for sg in dataset.get_sequencing_groups(only_active): 134 all_sequencing_groups[sg.id] = sg 135 return list(all_sequencing_groups.values()) 136 137 def create_cohort(self, id: str, name: str) -> 'Cohort': 138 """ 139 Create a cohort and add it to the multi-cohort. 140 """ 141 if id in self._cohorts_by_id: 142 LOGGER.debug(f'Cohort {id} already exists in the multi-cohort') 143 return self._cohorts_by_id[id] 144 145 c = Cohort(id=id, name=name) 146 self._cohorts_by_id[c.id] = c 147 return c 148 149 def add_dataset(self, d: 'Dataset') -> 'Dataset': 150 """ 151 Add a Dataset to the MultiCohort 152 Args: 153 d: Dataset object 154 """ 155 if d.name in self._datasets_by_name: 156 LOGGER.debug( 157 f'Dataset {d.name} already exists in the MultiCohort {self.name}', 158 ) 159 else: 160 # We need create a new dataset to avoid manipulating the cohort dataset at this point 161 self._datasets_by_name[d.name] = Dataset(d.name) 162 return self._datasets_by_name[d.name] 163 164 def get_dataset_by_name( 165 self, 166 name: str, 167 only_active: bool = True, 168 ) -> Optional['Dataset']: 169 """ 170 Get dataset by name. 
171 Include only "active" datasets (unless only_active is False) 172 """ 173 ds_by_name = {d.name: d for d in self.get_datasets(only_active)} 174 return ds_by_name.get(name) 175 176 def get_job_attrs(self) -> dict: 177 """ 178 Attributes for Hail Batch job. 179 """ 180 return { 181 # 'sequencing_groups': self.get_sequencing_group_ids(), 182 'datasets': [d.name for d in self.get_datasets()], 183 'cohorts': [c.id for c in self.get_cohorts()], 184 } 185 186 def write_ped_file( 187 self, 188 out_path: Path | None = None, 189 use_participant_id: bool = False, 190 ) -> Path: 191 """ 192 Create a PED file for all samples in the whole MultiCohort 193 Duplication of the Cohort method 194 PED is written with no header line to be strict specification compliant 195 """ 196 datas = [] 197 for sequencing_group in self.get_sequencing_groups(): 198 datas.append( 199 sequencing_group.pedigree.get_ped_dict( 200 use_participant_id=use_participant_id, 201 ), 202 ) 203 if not datas: 204 raise ValueError(f'No pedigree data found for {self.name}') 205 df = pd.DataFrame(datas) 206 207 if out_path is None: 208 out_path = self.analysis_dataset.tmp_prefix() / 'ped' / f'{self.get_alignment_inputs_hash()}.ped' 209 210 if not get_config()['workflow'].get('dry_run', False): 211 with out_path.open('w') as fp: 212 df.to_csv(fp, sep='\t', index=False, header=False) 213 return out_path
class MultiCohort(Target):
    """
    Represents a "multi-cohort" target - multiple cohorts in the workflow.
    """

    def __init__(self) -> None:
        """Resolve the multi-cohort name from the workflow config section."""
        super().__init__()

        # NOTE: For a cohort, we simply pull the dataset name from the config.
        input_cohorts = get_config()['workflow'].get('input_cohorts', [])
        self.name = '_'.join(sorted(input_cohorts)) if input_cohorts else get_config()['workflow']['dataset']

        assert self.name, 'Ensure cohorts or dataset is defined in the config file.'

        # Registries of the cohorts and datasets that make up this target.
        self._cohorts_by_id: dict[str, Cohort] = {}
        self._datasets_by_name: dict[str, Dataset] = {}
        # Dataset the analysis runs under; also hosts temporary outputs.
        self.analysis_dataset = Dataset(name=get_config()['workflow']['dataset'])

    def __repr__(self) -> str:
        n_cohorts = len(self.get_cohorts())
        return f'MultiCohort({n_cohorts} cohorts)'

    @property
    def target_id(self) -> str:
        """Unique target ID"""
        return self.name

    def create_dataset(self, name: str) -> 'Dataset':
        """
        Create a dataset and add it to the cohort.

        Idempotent: an already-registered dataset of this name is returned
        unchanged, and the analysis dataset object is reused for its own name.
        """
        existing = self._datasets_by_name.get(name)
        if existing is not None:
            return existing

        ds = self.analysis_dataset if name == self.analysis_dataset.name else Dataset(name=name)
        self._datasets_by_name[ds.name] = ds
        return ds

    def get_cohorts(self, only_active: bool = True) -> list['Cohort']:
        """
        Gets list of all cohorts.
        Include only "active" cohorts (unless only_active is False)
        """
        if only_active:
            return [cohort for cohort in self._cohorts_by_id.values() if cohort.active]
        return list(self._cohorts_by_id.values())

    def get_cohort_ids(self, only_active: bool = True) -> list[str]:
        """
        Get list of cohort IDs.
        Include only "active" cohorts (unless only_active is False)
        """
        ids = []
        for cohort in self.get_cohorts(only_active):
            ids.append(cohort.get_cohort_id())
        return ids

    def get_cohort_by_id(
        self,
        id: str,
        only_active: bool = True,
    ) -> Optional['Cohort']:
        """
        Get cohort by id.
        Include only "active" cohorts (unless only_active is False)
        """
        cohort = self._cohorts_by_id.get(id)
        if not cohort:
            LOGGER.warning(f'Cohort {id} not found in the multi-cohort')

        if not only_active:
            # Caller wants the cohort regardless of its active flag.
            return cohort
        return cohort if isinstance(cohort, Cohort) and cohort.active else None

    def get_datasets(self, only_active: bool = True) -> list['Dataset']:
        """
        Gets list of all datasets.
        Include only "active" datasets (unless only_active is False)

        With only_active, empty datasets (no sequencing groups) are dropped too.
        """
        if not only_active:
            return list(self._datasets_by_name.values())
        return [ds for ds in self._datasets_by_name.values() if ds.active and ds.get_sequencing_groups()]

    def get_sequencing_groups(
        self,
        only_active: bool = True,
    ) -> list['SequencingGroup']:
        """
        Gets a flat list of all sequencing groups from all datasets.
        A dict keyed by sequencing-group ID deduplicates groups that appear in
        more than one cohort.
        Include only "active" sequencing groups (unless only_active is False)
        """
        unique: dict[str, SequencingGroup] = {
            sg.id: sg
            for dataset in self.get_datasets(only_active)
            for sg in dataset.get_sequencing_groups(only_active)
        }
        return list(unique.values())

    def create_cohort(self, id: str, name: str) -> 'Cohort':
        """
        Create a cohort and add it to the multi-cohort.

        Idempotent: returns the existing Cohort if the id is already known.
        """
        existing = self._cohorts_by_id.get(id)
        if existing is not None:
            LOGGER.debug(f'Cohort {id} already exists in the multi-cohort')
            return existing

        cohort = Cohort(id=id, name=name)
        self._cohorts_by_id[cohort.id] = cohort
        return cohort

    def add_dataset(self, d: 'Dataset') -> 'Dataset':
        """
        Add a Dataset to the MultiCohort.

        Args:
            d: Dataset object

        Returns:
            The Dataset registered under that name.
        """
        if d.name not in self._datasets_by_name:
            # We need create a new dataset to avoid manipulating the cohort dataset at this point
            self._datasets_by_name[d.name] = Dataset(d.name)
        else:
            LOGGER.debug(
                f'Dataset {d.name} already exists in the MultiCohort {self.name}',
            )
        return self._datasets_by_name[d.name]

    def get_dataset_by_name(
        self,
        name: str,
        only_active: bool = True,
    ) -> Optional['Dataset']:
        """
        Get dataset by name.
        Include only "active" datasets (unless only_active is False)
        """
        for dataset in self.get_datasets(only_active):
            if dataset.name == name:
                return dataset
        return None

    def get_job_attrs(self) -> dict:
        """
        Attributes for Hail Batch job.
        """
        dataset_names = [ds.name for ds in self.get_datasets()]
        cohort_ids = [cohort.id for cohort in self.get_cohorts()]
        return {
            # 'sequencing_groups': self.get_sequencing_group_ids(),
            'datasets': dataset_names,
            'cohorts': cohort_ids,
        }

    def write_ped_file(
        self,
        out_path: Path | None = None,
        use_participant_id: bool = False,
    ) -> Path:
        """
        Create a PED file for all samples in the whole MultiCohort
        Duplication of the Cohort method
        PED is written with no header line to be strict specification compliant

        Raises ValueError when there is no pedigree data. In dry_run mode
        nothing is written, but the intended path is still returned.
        """
        ped_rows = [
            sg.pedigree.get_ped_dict(use_participant_id=use_participant_id)
            for sg in self.get_sequencing_groups()
        ]
        if not ped_rows:
            raise ValueError(f'No pedigree data found for {self.name}')
        df = pd.DataFrame(ped_rows)

        if out_path is None:
            # Default path is keyed by the alignment-inputs hash.
            out_path = self.analysis_dataset.tmp_prefix() / 'ped' / f'{self.get_alignment_inputs_hash()}.ped'

        if not get_config()['workflow'].get('dry_run', False):
            with out_path.open('w') as fp:
                df.to_csv(fp, sep='\t', index=False, header=False)
        return out_path
Represents a "multi-cohort" target - multiple cohorts in the workflow.
62 def create_dataset(self, name: str) -> 'Dataset': 63 """ 64 Create a dataset and add it to the cohort. 65 """ 66 if name in self._datasets_by_name: 67 return self._datasets_by_name[name] 68 69 if name == self.analysis_dataset.name: 70 ds = self.analysis_dataset 71 else: 72 ds = Dataset(name=name) 73 74 self._datasets_by_name[ds.name] = ds 75 return ds
Create a dataset and add it to the cohort.
77 def get_cohorts(self, only_active: bool = True) -> list['Cohort']: 78 """ 79 Gets list of all cohorts. 80 Include only "active" cohorts (unless only_active is False) 81 """ 82 cohorts = list(self._cohorts_by_id.values()) 83 if only_active: 84 cohorts = [c for c in cohorts if c.active] 85 return cohorts
Gets list of all cohorts. Include only "active" cohorts (unless only_active is False)
87 def get_cohort_ids(self, only_active: bool = True) -> list['str']: 88 """ 89 Get list of cohort IDs. 90 Include only "active" cohorts (unless only_active is False) 91 """ 92 return [c.get_cohort_id() for c in self.get_cohorts(only_active)]
Get list of cohort IDs. Include only "active" cohorts (unless only_active is False)
94 def get_cohort_by_id( 95 self, 96 id: str, 97 only_active: bool = True, 98 ) -> Optional['Cohort']: 99 """ 100 Get cohort by id. 101 Include only "active" cohorts (unless only_active is False) 102 """ 103 cohort = self._cohorts_by_id.get(id) 104 if not cohort: 105 LOGGER.warning(f'Cohort {id} not found in the multi-cohort') 106 107 if not only_active: # Return cohort even if it's inactive 108 return cohort 109 if isinstance(cohort, Cohort) and cohort.active: 110 return cohort 111 return None
Get cohort by id. Include only "active" cohorts (unless only_active is False)
113 def get_datasets(self, only_active: bool = True) -> list['Dataset']: 114 """ 115 Gets list of all datasets. 116 Include only "active" datasets (unless only_active is False) 117 """ 118 all_datasets = list(self._datasets_by_name.values()) 119 if only_active: 120 all_datasets = [d for d in all_datasets if d.active and d.get_sequencing_groups()] 121 return all_datasets
Gets list of all datasets. Include only "active" datasets (unless only_active is False)
123 def get_sequencing_groups( 124 self, 125 only_active: bool = True, 126 ) -> list['SequencingGroup']: 127 """ 128 Gets a flat list of all sequencing groups from all datasets. 129 uses a dictionary to avoid duplicates (we could have the same sequencing group in multiple cohorts) 130 Include only "active" sequencing groups (unless only_active is False) 131 """ 132 all_sequencing_groups: dict[str, SequencingGroup] = {} 133 for dataset in self.get_datasets(only_active): 134 for sg in dataset.get_sequencing_groups(only_active): 135 all_sequencing_groups[sg.id] = sg 136 return list(all_sequencing_groups.values())
Gets a flat list of all sequencing groups from all datasets. Uses a dictionary to avoid duplicates (the same sequencing group can appear in multiple cohorts). Includes only "active" sequencing groups (unless only_active is False).
138 def create_cohort(self, id: str, name: str) -> 'Cohort': 139 """ 140 Create a cohort and add it to the multi-cohort. 141 """ 142 if id in self._cohorts_by_id: 143 LOGGER.debug(f'Cohort {id} already exists in the multi-cohort') 144 return self._cohorts_by_id[id] 145 146 c = Cohort(id=id, name=name) 147 self._cohorts_by_id[c.id] = c 148 return c
Create a cohort and add it to the multi-cohort.
150 def add_dataset(self, d: 'Dataset') -> 'Dataset': 151 """ 152 Add a Dataset to the MultiCohort 153 Args: 154 d: Dataset object 155 """ 156 if d.name in self._datasets_by_name: 157 LOGGER.debug( 158 f'Dataset {d.name} already exists in the MultiCohort {self.name}', 159 ) 160 else: 161 # We need create a new dataset to avoid manipulating the cohort dataset at this point 162 self._datasets_by_name[d.name] = Dataset(d.name) 163 return self._datasets_by_name[d.name]
Add a Dataset to the MultiCohort. Args: d — the Dataset object to add.
165 def get_dataset_by_name( 166 self, 167 name: str, 168 only_active: bool = True, 169 ) -> Optional['Dataset']: 170 """ 171 Get dataset by name. 172 Include only "active" datasets (unless only_active is False) 173 """ 174 ds_by_name = {d.name: d for d in self.get_datasets(only_active)} 175 return ds_by_name.get(name)
Get dataset by name. Include only "active" datasets (unless only_active is False)
177 def get_job_attrs(self) -> dict: 178 """ 179 Attributes for Hail Batch job. 180 """ 181 return { 182 # 'sequencing_groups': self.get_sequencing_group_ids(), 183 'datasets': [d.name for d in self.get_datasets()], 184 'cohorts': [c.id for c in self.get_cohorts()], 185 }
Attributes for Hail Batch job.
187 def write_ped_file( 188 self, 189 out_path: Path | None = None, 190 use_participant_id: bool = False, 191 ) -> Path: 192 """ 193 Create a PED file for all samples in the whole MultiCohort 194 Duplication of the Cohort method 195 PED is written with no header line to be strict specification compliant 196 """ 197 datas = [] 198 for sequencing_group in self.get_sequencing_groups(): 199 datas.append( 200 sequencing_group.pedigree.get_ped_dict( 201 use_participant_id=use_participant_id, 202 ), 203 ) 204 if not datas: 205 raise ValueError(f'No pedigree data found for {self.name}') 206 df = pd.DataFrame(datas) 207 208 if out_path is None: 209 out_path = self.analysis_dataset.tmp_prefix() / 'ped' / f'{self.get_alignment_inputs_hash()}.ped' 210 211 if not get_config()['workflow'].get('dry_run', False): 212 with out_path.open('w') as fp: 213 df.to_csv(fp, sep='\t', index=False, header=False) 214 return out_path
Create a PED file for all samples in the whole MultiCohort Duplication of the Cohort method PED is written with no header line to be strict specification compliant