cpg_flow.targets.cohort

This module defines the Cohort class, which represents a cohort target in the workflow.
A cohort consists of all sequencing groups from a single CustomCohort, potentially spanning multiple datasets.

Classes:
    Cohort: Represents a cohort target in the workflow.

Usage:
    The Cohort class is used to manage and analyze sequencing groups within a cohort.
    It provides methods to add sequencing groups, retrieve sequencing groups, write PED files,
    and export cohort data to TSV files.

Example:
    cohort = Cohort(name="example_cohort")
    cohort.add_sequencing_group_object(sequencing_group)
    ped_file_path = cohort.write_ped_file()
    tsv_file_path = cohort.to_tsv()
1""" 2This module defines the `Cohort` class, which represents a cohort target in the workflow. 3A cohort consists of all sequencing groups from a single CustomCohort, potentially spanning multiple datasets. 4 5Classes: 6 Cohort: Represents a cohort target in the workflow. 7 8Usage: 9 The `Cohort` class is used to manage and analyze sequencing groups within a cohort. 10 It provides methods to add sequencing groups, retrieve sequencing groups, write PED files, 11 and export cohort data to TSV files. 12 13Example: 14 cohort = Cohort(name="example_cohort") 15 cohort.add_sequencing_group_object(sequencing_group) 16 ped_file_path = cohort.write_ped_file() 17 tsv_file_path = cohort.to_tsv() 18 19""" 20 21from typing import TYPE_CHECKING 22 23import pandas as pd 24 25from cpg_flow.targets import Dataset, Target 26from cpg_flow.utils import get_logger 27from cpg_utils import Path, to_path 28from cpg_utils.config import get_config 29 30LOGGER = get_logger(__name__) 31 32if TYPE_CHECKING: 33 from cpg_flow.targets import SequencingGroup 34 35 36class Cohort(Target): 37 """ 38 Represents a "cohort" target - all sequencing groups from a single CustomCohort (potentially spanning multiple datasets) in the workflow. 39 Analysis dataset name is required and will be used as the default name for the 40 cohort. 41 """ 42 43 def __init__(self, id: str | None = None, name: str | None = None) -> None: 44 super().__init__() 45 self.id = id or get_config()['workflow']['dataset'] 46 self.name = name or get_config()['workflow']['dataset'] 47 self.analysis_dataset = Dataset(name=get_config()['workflow']['dataset']) 48 self._sequencing_group_by_id: dict[str, SequencingGroup] = {} 49 50 def __repr__(self): 51 return f'Cohort("{self.id}", {len(self._sequencing_group_by_id)} SGs)' 52 53 @property 54 def target_id(self) -> str: 55 """Unique target ID""" 56 return self.id 57 58 def get_cohort_id(self) -> str: 59 """Get the cohort ID""" 60 return self.id 61 62 def write_ped_file( 63 self, 64 out_path: Path | None = None, 65 use_participant_id: bool = False, 66 ) -> Path: 67 """ 68 Create a PED file for all samples in the whole cohort 69 PED is written with no header line to be strict specification compliant 70 """ 71 datas = [] 72 for sequencing_group in self.get_sequencing_groups(): 73 datas.append( 74 sequencing_group.pedigree.get_ped_dict( 75 use_participant_id=use_participant_id, 76 ), 77 ) 78 if not datas: 79 raise ValueError(f'No pedigree data found for {self.id}') 80 df = pd.DataFrame(datas) 81 82 if out_path is None: 83 out_path = self.analysis_dataset.tmp_prefix() / 'ped' / f'{self.get_alignment_inputs_hash()}.ped' 84 85 if not get_config()['workflow'].get('dry_run', False): 86 with out_path.open('w') as fp: 87 df.to_csv(fp, sep='\t', index=False, header=False) 88 return out_path 89 90 def add_sequencing_group_object( 91 self, 92 s: 'SequencingGroup', 93 allow_duplicates: bool = True, 94 ): 95 """ 96 Add a sequencing group object to the Cohort. 
97 Args: 98 s: SequencingGroup object 99 allow_duplicates: if True, allow adding the same object twice 100 """ 101 if s.id in self._sequencing_group_by_id: 102 if allow_duplicates: 103 LOGGER.debug( 104 f'SequencingGroup {s.id} already exists in the Cohort {self.name}', 105 ) 106 return self._sequencing_group_by_id[s.id] 107 raise ValueError( 108 f'SequencingGroup {s.id} already exists in the Cohort {self.name}', 109 ) 110 self._sequencing_group_by_id[s.id] = s 111 112 def get_sequencing_groups( 113 self, 114 only_active: bool = True, 115 ) -> list['SequencingGroup']: 116 """ 117 Gets a flat list of all sequencing groups from all datasets. 118 Include only "active" sequencing groups (unless only_active is False) 119 """ 120 return [s for s in self._sequencing_group_by_id.values() if (s.active or not only_active)] 121 122 def get_job_attrs(self) -> dict: 123 """ 124 Attributes for Hail Batch job. 125 """ 126 return { 127 # 'sequencing_groups': self.get_sequencing_group_ids(), 128 } 129 130 def get_job_prefix(self) -> str: 131 """ 132 Prefix job names. 133 """ 134 return '' 135 136 def to_tsv(self) -> str: 137 """ 138 Export to a parsable TSV file 139 """ 140 assert self.get_sequencing_groups() 141 tsv_path = self.analysis_dataset.tmp_prefix() / 'samples.tsv' 142 df = pd.DataFrame( 143 { 144 's': s.id, 145 'gvcf': s.gvcf or '-', 146 'sex': s.meta.get('sex') or '-', 147 'continental_pop': s.meta.get('continental_pop') or '-', 148 'subcontinental_pop': s.meta.get('subcontinental_pop') or '-', 149 } 150 for s in self.get_sequencing_groups() 151 ).set_index('s', drop=False) 152 with to_path(tsv_path).open('w') as f: 153 df.to_csv(f, index=False, sep='\t', na_rep='NA') 154 return tsv_path
Represents a "cohort" target - all sequencing groups from a single CustomCohort (potentially spanning multiple datasets) in the workflow. Analysis dataset name is required and will be used as the default name for the cohort.

__init__(id: str | None = None, name: str | None = None) -> None
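
A minimal construction sketch, assuming a loaded workflow config whose workflow.dataset entry supplies the default id and name (the ids shown below are hypothetical):

    from cpg_flow.targets.cohort import Cohort

    # With no arguments, id and name both fall back to
    # get_config()['workflow']['dataset'], and analysis_dataset wraps that same name.
    default_cohort = Cohort()

    # Explicit values override the config-derived defaults (hypothetical ids).
    named_cohort = Cohort(id='COH123', name='example_cohort')
    print(repr(named_cohort))  # Cohort("COH123", 0 SGs)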

write_ped_file(out_path: Path | None = None, use_participant_id: bool = False) -> Path

    Create a PED file for all samples in the whole cohort.
    PED is written with no header line to be strict specification compliant.
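
A usage sketch, assuming the cohort already holds sequencing groups with pedigree data; the bucket path below is hypothetical:

    from cpg_utils import to_path

    # Default location: <analysis dataset tmp prefix>/ped/<alignment inputs hash>.ped
    ped_path = cohort.write_ped_file()

    # Or write to an explicit destination; use_participant_id is forwarded to
    # each sequencing group's pedigree.get_ped_dict().
    ped_path = cohort.write_ped_file(
        out_path=to_path('gs://my-bucket/ped/example_cohort.ped'),
        use_participant_id=True,
    )
    # The file is tab-separated with no header line; an empty cohort raises
    # ValueError, and nothing is written when workflow.dry_run is set.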

add_sequencing_group_object(s: 'SequencingGroup', allow_duplicates: bool = True)

    Add a sequencing group object to the Cohort.
    Args:
        s: SequencingGroup object
        allow_duplicates: if True, allow adding the same object twice
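
The duplicate handling can be summarised in a short sketch (sg is a hypothetical SequencingGroup instance):

    cohort.add_sequencing_group_object(sg)

    # Re-adding the same ID with allow_duplicates=True (the default) only logs a
    # debug message and returns the already-registered object.
    existing = cohort.add_sequencing_group_object(sg)

    # With allow_duplicates=False, a repeated ID raises ValueError.
    try:
        cohort.add_sequencing_group_object(sg, allow_duplicates=False)
    except ValueError as err:
        print(err)  # SequencingGroup <id> already exists in the Cohort <name>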

get_sequencing_groups(only_active: bool = True) -> list['SequencingGroup']

    Gets a flat list of all sequencing groups from all datasets.
    Includes only "active" sequencing groups (unless only_active is False).
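
Filtering behaviour, sketched assuming sequencing groups carry an active flag:

    # Only sequencing groups whose active flag is set (the default view).
    active_sgs = cohort.get_sequencing_groups()

    # Everything ever added to the cohort, active or not.
    all_sgs = cohort.get_sequencing_groups(only_active=False)

    active_ids = [sg.id for sg in active_sgs]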

get_job_attrs() -> dict

    Attributes for Hail Batch job.

to_tsv() -> str

    Export to a parsable TSV file.
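
A read-back sketch, assuming at least one sequencing group has been added (the method asserts the cohort is non-empty):

    import pandas as pd
    from cpg_utils import to_path

    tsv_path = cohort.to_tsv()  # writes <tmp prefix>/samples.tsv and returns its path

    # Columns: s, gvcf, sex, continental_pop, subcontinental_pop, with '-' where
    # metadata or a GVCF is missing.
    with to_path(tsv_path).open() as f:
        samples = pd.read_csv(f, sep='\t')
    print(samples['s'].tolist())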