cpg_flow.targets.cohort
This module defines the `Cohort` class, which represents a cohort target in the workflow.
A cohort consists of all sequencing groups from a single CustomCohort, potentially spanning multiple datasets.
Classes: Cohort: Represents a cohort target in the workflow.
Usage:
The `Cohort` class is used to manage and analyze sequencing groups within a cohort.
It provides methods to add sequencing groups, retrieve sequencing groups, write PED files,
and export cohort data to TSV files.
Example:
    cohort = Cohort(name="example_cohort")
    cohort.add_sequencing_group_object(sequencing_group)
    ped_file_path = cohort.write_ped_file()
    tsv_file_path = cohort.to_tsv()
"""
This module defines the `Cohort` class, which represents a cohort target in the workflow.
A cohort consists of all sequencing groups from a single CustomCohort, potentially spanning multiple datasets.

Classes:
    Cohort: Represents a cohort target in the workflow.

Usage:
    The `Cohort` class is used to manage and analyze sequencing groups within a cohort.
    It provides methods to add sequencing groups, retrieve sequencing groups, write PED files,
    and export cohort data to TSV files.

Example:
    cohort = Cohort(name="example_cohort")
    cohort.add_sequencing_group_object(sequencing_group)
    ped_file_path = cohort.write_ped_file()
    tsv_file_path = cohort.to_tsv()

"""

from typing import TYPE_CHECKING

import pandas as pd

from cpg_flow.targets import Dataset, Target
from cpg_flow.utils import get_logger
from cpg_utils import Path, to_path
from cpg_utils.config import get_config

LOGGER = get_logger(__name__)

if TYPE_CHECKING:
    from cpg_flow.targets import SequencingGroup


class Cohort(Target):
    """
    Represents a "cohort" target - all sequencing groups from a single CustomCohort
    (potentially spanning multiple datasets) in the workflow.
    Analysis dataset name is required and will be used as the default name for the
    cohort.
    """

    def __init__(self, name: str | None = None) -> None:
        """
        Create an empty cohort.

        Args:
            name: cohort name. When omitted (or empty), the ``workflow/dataset``
                config value is used instead.
        """
        super().__init__()
        workflow_dataset = get_config()['workflow']['dataset']
        self.name = name or workflow_dataset
        # The analysis dataset always comes from the config, regardless of ``name``.
        self.analysis_dataset = Dataset(name=workflow_dataset)
        # Sequencing groups keyed by their ID; insertion order is preserved.
        self._sequencing_group_by_id: dict[str, SequencingGroup] = {}

    def __repr__(self):
        return f'Cohort("{self.name}", {len(self._sequencing_group_by_id)} SGs)'

    @property
    def target_id(self) -> str:
        """Unique target ID"""
        return self.name

    def get_cohort_id(self) -> str:
        """Get the cohort ID"""
        return self.name

    def write_ped_file(
        self,
        out_path: Path | None = None,
        use_participant_id: bool = False,
    ) -> Path:
        """
        Create a PED file for all samples in the whole cohort.
        PED is written with no header line to be strict specification compliant.

        Args:
            out_path: destination path. Defaults to
                ``<analysis dataset tmp prefix>/ped/<alignment inputs hash>.ped``.
            use_participant_id: forwarded unchanged to each sequencing group's
                ``pedigree.get_ped_dict``.

        Returns:
            The path of the PED file (returned even when the write is skipped
            because ``workflow/dry_run`` is set in the config).

        Raises:
            ValueError: if the cohort has no active sequencing groups.
        """
        ped_rows = [
            sequencing_group.pedigree.get_ped_dict(use_participant_id=use_participant_id)
            for sequencing_group in self.get_sequencing_groups()
        ]
        if not ped_rows:
            raise ValueError(f'No pedigree data found for {self.name}')
        df = pd.DataFrame(ped_rows)

        if out_path is None:
            out_path = self.analysis_dataset.tmp_prefix() / 'ped' / f'{self.get_alignment_inputs_hash()}.ped'

        if not get_config()['workflow'].get('dry_run', False):
            with out_path.open('w') as fp:
                df.to_csv(fp, sep='\t', index=False, header=False)
        return out_path

    def add_sequencing_group_object(
        self,
        s: 'SequencingGroup',
        allow_duplicates: bool = True,
    ) -> 'SequencingGroup':
        """
        Add a sequencing group object to the Cohort.

        Args:
            s: SequencingGroup object
            allow_duplicates: if True, adding an ID that is already present is a
                no-op that returns the object already stored; if False, it is
                an error.

        Returns:
            The sequencing group stored under ``s.id``: ``s`` itself when newly
            added, otherwise the previously stored object.

        Raises:
            ValueError: if ``s.id`` is already present and ``allow_duplicates``
                is False.
        """
        if s.id in self._sequencing_group_by_id:
            if not allow_duplicates:
                raise ValueError(
                    f'SequencingGroup {s.id} already exists in the Cohort {self.name}',
                )
            LOGGER.debug(
                f'SequencingGroup {s.id} already exists in the Cohort {self.name}',
            )
            return self._sequencing_group_by_id[s.id]
        self._sequencing_group_by_id[s.id] = s
        # Return the stored object on this path too, so callers always get one back.
        return s

    def get_sequencing_groups(
        self,
        only_active: bool = True,
    ) -> list['SequencingGroup']:
        """
        Gets a flat list of all sequencing groups from all datasets.
        Include only "active" sequencing groups (unless only_active is False)
        """
        return [s for s in self._sequencing_group_by_id.values() if (s.active or not only_active)]

    def get_job_attrs(self) -> dict:
        """
        Attributes for Hail Batch job.
        """
        # NOTE(review): a 'sequencing_groups' attribute was commented out in the
        # original implementation; no attributes are attached at cohort level.
        return {}

    def get_job_prefix(self) -> str:
        """
        Prefix job names.
        """
        return ''

    def to_tsv(self) -> Path:
        """
        Export to a parsable TSV file.

        Writes one row per active sequencing group to
        ``<analysis dataset tmp prefix>/samples.tsv`` with the columns ``s``,
        ``gvcf``, ``sex``, ``continental_pop`` and ``subcontinental_pop``.
        Falsy values are written as ``-``.

        Returns:
            The path of the written TSV file.

        Raises:
            ValueError: if the cohort has no active sequencing groups.
        """
        sequencing_groups = self.get_sequencing_groups()
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if not sequencing_groups:
            raise ValueError(f'No active sequencing groups found in cohort {self.name}')

        tsv_path = self.analysis_dataset.tmp_prefix() / 'samples.tsv'
        rows = [
            {
                's': sg.id,
                'gvcf': sg.gvcf or '-',
                'sex': sg.meta.get('sex') or '-',
                'continental_pop': sg.meta.get('continental_pop') or '-',
                'subcontinental_pop': sg.meta.get('subcontinental_pop') or '-',
            }
            for sg in sequencing_groups
        ]
        df = pd.DataFrame(rows)
        with to_path(tsv_path).open('w') as f:
            # index=False: the 's' column already carries the sequencing group ID.
            df.to_csv(f, index=False, sep='\t', na_rep='NA')
        return tsv_path
class Cohort(Target):
    """
    Represents a "cohort" target - all sequencing groups from a single CustomCohort
    (potentially spanning multiple datasets) in the workflow.
    Analysis dataset name is required and will be used as the default name for the
    cohort.
    """

    def __init__(self, name: str | None = None) -> None:
        """
        Create an empty cohort.

        Args:
            name: cohort name. When omitted (or empty), the ``workflow/dataset``
                config value is used instead.
        """
        super().__init__()
        workflow_dataset = get_config()['workflow']['dataset']
        self.name = name or workflow_dataset
        # The analysis dataset always comes from the config, regardless of ``name``.
        self.analysis_dataset = Dataset(name=workflow_dataset)
        # Sequencing groups keyed by their ID; insertion order is preserved.
        self._sequencing_group_by_id: dict[str, SequencingGroup] = {}

    def __repr__(self):
        return f'Cohort("{self.name}", {len(self._sequencing_group_by_id)} SGs)'

    @property
    def target_id(self) -> str:
        """Unique target ID"""
        return self.name

    def get_cohort_id(self) -> str:
        """Get the cohort ID"""
        return self.name

    def write_ped_file(
        self,
        out_path: Path | None = None,
        use_participant_id: bool = False,
    ) -> Path:
        """
        Create a PED file for all samples in the whole cohort.
        PED is written with no header line to be strict specification compliant.

        Args:
            out_path: destination path. Defaults to
                ``<analysis dataset tmp prefix>/ped/<alignment inputs hash>.ped``.
            use_participant_id: forwarded unchanged to each sequencing group's
                ``pedigree.get_ped_dict``.

        Returns:
            The path of the PED file (returned even when the write is skipped
            because ``workflow/dry_run`` is set in the config).

        Raises:
            ValueError: if the cohort has no active sequencing groups.
        """
        ped_rows = [
            sequencing_group.pedigree.get_ped_dict(use_participant_id=use_participant_id)
            for sequencing_group in self.get_sequencing_groups()
        ]
        if not ped_rows:
            raise ValueError(f'No pedigree data found for {self.name}')
        df = pd.DataFrame(ped_rows)

        if out_path is None:
            out_path = self.analysis_dataset.tmp_prefix() / 'ped' / f'{self.get_alignment_inputs_hash()}.ped'

        if not get_config()['workflow'].get('dry_run', False):
            with out_path.open('w') as fp:
                df.to_csv(fp, sep='\t', index=False, header=False)
        return out_path

    def add_sequencing_group_object(
        self,
        s: 'SequencingGroup',
        allow_duplicates: bool = True,
    ) -> 'SequencingGroup':
        """
        Add a sequencing group object to the Cohort.

        Args:
            s: SequencingGroup object
            allow_duplicates: if True, adding an ID that is already present is a
                no-op that returns the object already stored; if False, it is
                an error.

        Returns:
            The sequencing group stored under ``s.id``: ``s`` itself when newly
            added, otherwise the previously stored object.

        Raises:
            ValueError: if ``s.id`` is already present and ``allow_duplicates``
                is False.
        """
        if s.id in self._sequencing_group_by_id:
            if not allow_duplicates:
                raise ValueError(
                    f'SequencingGroup {s.id} already exists in the Cohort {self.name}',
                )
            LOGGER.debug(
                f'SequencingGroup {s.id} already exists in the Cohort {self.name}',
            )
            return self._sequencing_group_by_id[s.id]
        self._sequencing_group_by_id[s.id] = s
        # Return the stored object on this path too, so callers always get one back.
        return s

    def get_sequencing_groups(
        self,
        only_active: bool = True,
    ) -> list['SequencingGroup']:
        """
        Gets a flat list of all sequencing groups from all datasets.
        Include only "active" sequencing groups (unless only_active is False)
        """
        return [s for s in self._sequencing_group_by_id.values() if (s.active or not only_active)]

    def get_job_attrs(self) -> dict:
        """
        Attributes for Hail Batch job.
        """
        # NOTE(review): a 'sequencing_groups' attribute was commented out in the
        # original implementation; no attributes are attached at cohort level.
        return {}

    def get_job_prefix(self) -> str:
        """
        Prefix job names.
        """
        return ''

    def to_tsv(self) -> Path:
        """
        Export to a parsable TSV file.

        Writes one row per active sequencing group to
        ``<analysis dataset tmp prefix>/samples.tsv`` with the columns ``s``,
        ``gvcf``, ``sex``, ``continental_pop`` and ``subcontinental_pop``.
        Falsy values are written as ``-``.

        Returns:
            The path of the written TSV file.

        Raises:
            ValueError: if the cohort has no active sequencing groups.
        """
        sequencing_groups = self.get_sequencing_groups()
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if not sequencing_groups:
            raise ValueError(f'No active sequencing groups found in cohort {self.name}')

        tsv_path = self.analysis_dataset.tmp_prefix() / 'samples.tsv'
        rows = [
            {
                's': sg.id,
                'gvcf': sg.gvcf or '-',
                'sex': sg.meta.get('sex') or '-',
                'continental_pop': sg.meta.get('continental_pop') or '-',
                'subcontinental_pop': sg.meta.get('subcontinental_pop') or '-',
            }
            for sg in sequencing_groups
        ]
        df = pd.DataFrame(rows)
        with to_path(tsv_path).open('w') as f:
            # index=False: the 's' column already carries the sequencing group ID.
            df.to_csv(f, index=False, sep='\t', na_rep='NA')
        return tsv_path
Represents a "cohort" target - all sequencing groups from a single CustomCohort (potentially spanning multiple datasets) in the workflow. Analysis dataset name is required and will be used as the default name for the cohort.
62 def write_ped_file( 63 self, 64 out_path: Path | None = None, 65 use_participant_id: bool = False, 66 ) -> Path: 67 """ 68 Create a PED file for all samples in the whole cohort 69 PED is written with no header line to be strict specification compliant 70 """ 71 datas = [] 72 for sequencing_group in self.get_sequencing_groups(): 73 datas.append( 74 sequencing_group.pedigree.get_ped_dict( 75 use_participant_id=use_participant_id, 76 ), 77 ) 78 if not datas: 79 raise ValueError(f'No pedigree data found for {self.name}') 80 df = pd.DataFrame(datas) 81 82 if out_path is None: 83 out_path = self.analysis_dataset.tmp_prefix() / 'ped' / f'{self.get_alignment_inputs_hash()}.ped' 84 85 if not get_config()['workflow'].get('dry_run', False): 86 with out_path.open('w') as fp: 87 df.to_csv(fp, sep='\t', index=False, header=False) 88 return out_path
Create a PED file for all samples in the whole cohort. The PED file is written with no header line, to comply strictly with the PED specification.
def add_sequencing_group_object(
    self,
    s: 'SequencingGroup',
    allow_duplicates: bool = True,
) -> 'SequencingGroup':
    """
    Add a sequencing group object to the Cohort.

    Args:
        s: SequencingGroup object
        allow_duplicates: if True, adding an ID that is already present is a
            no-op that returns the object already stored; if False, it is
            an error.

    Returns:
        The sequencing group stored under ``s.id``: ``s`` itself when newly
        added, otherwise the previously stored object.

    Raises:
        ValueError: if ``s.id`` is already present and ``allow_duplicates``
            is False.
    """
    if s.id in self._sequencing_group_by_id:
        if not allow_duplicates:
            raise ValueError(
                f'SequencingGroup {s.id} already exists in the Cohort {self.name}',
            )
        LOGGER.debug(
            f'SequencingGroup {s.id} already exists in the Cohort {self.name}',
        )
        return self._sequencing_group_by_id[s.id]
    self._sequencing_group_by_id[s.id] = s
    # Return the stored object on this path too, so callers always get one back.
    return s
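Add a sequencing group object to the Cohort.
Args:
    s: SequencingGroup object
    allow_duplicates: if True, adding a sequencing group whose ID is already present is a no-op that returns the existing object; if False, a ValueError is raised instead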
def get_sequencing_groups(
    self,
    only_active: bool = True,
) -> list['SequencingGroup']:
    """
    Return the cohort's sequencing groups as a flat list, in insertion order.

    Args:
        only_active: when True (the default), sequencing groups whose
            ``active`` attribute is falsy are left out; when False, every
            sequencing group is returned.
    """
    selected = []
    for candidate in self._sequencing_group_by_id.values():
        # Skip inactive entries only when the caller asked for active ones.
        if not candidate.active and only_active:
            continue
        selected.append(candidate)
    return selected
Get a flat list of all sequencing groups from all datasets. Only "active" sequencing groups are included, unless only_active is False.
def get_job_attrs(self) -> dict:
    """
    Return the attributes to attach to Hail Batch jobs for this cohort.

    Currently this is always an empty dict.
    """
    # NOTE(review): a 'sequencing_groups' attribute was commented out in the
    # original implementation; it stays disabled here.
    attrs: dict = {}
    return attrs
Attributes for Hail Batch job.
def to_tsv(self) -> Path:
    """
    Export to a parsable TSV file.

    Writes one row per active sequencing group to
    ``<analysis dataset tmp prefix>/samples.tsv`` with the columns ``s``,
    ``gvcf``, ``sex``, ``continental_pop`` and ``subcontinental_pop``.
    Falsy values are written as ``-``.

    Returns:
        The path of the written TSV file.

    Raises:
        ValueError: if the cohort has no active sequencing groups.
    """
    sequencing_groups = self.get_sequencing_groups()
    # Explicit check instead of ``assert`` so it survives ``python -O``.
    if not sequencing_groups:
        raise ValueError(f'No active sequencing groups found in cohort {self.name}')

    tsv_path = self.analysis_dataset.tmp_prefix() / 'samples.tsv'
    rows = [
        {
            's': sg.id,
            'gvcf': sg.gvcf or '-',
            'sex': sg.meta.get('sex') or '-',
            'continental_pop': sg.meta.get('continental_pop') or '-',
            'subcontinental_pop': sg.meta.get('subcontinental_pop') or '-',
        }
        for sg in sequencing_groups
    ]
    df = pd.DataFrame(rows)
    with to_path(tsv_path).open('w') as f:
        # index=False: the 's' column already carries the sequencing group ID.
        df.to_csv(f, index=False, sep='\t', na_rep='NA')
    return tsv_path
Export to a parsable TSV file