cpg_flow.targets.sequencing_group
from typing import TYPE_CHECKING, Optional

from cpg_flow.filetypes import AlignmentInput, BamPath, CramPath, FastqPairs, GvcfPath
from cpg_flow.metamist import Assay
from cpg_flow.targets import PedigreeInfo, Sex, Target
from cpg_utils import Path
from cpg_utils.config import reference_path

if TYPE_CHECKING:
    from cpg_flow.targets import Dataset


class SequencingGroup(Target):
    """
    Represents a sequencing group.

    Besides the identifiers and sequencing descriptors passed to the
    constructor, an instance carries a `PedigreeInfo`, optional alignment
    input and assays, the `forced` / `active` flags, and `gvcf` / `cram`
    slots that start out as None.
    """

    def __init__(
        self,
        id: str,
        dataset: 'Dataset',
        *,
        sequencing_type: str,
        sequencing_technology: str,
        sequencing_platform: str,
        external_id: str | None = None,
        participant_id: str | None = None,
        meta: dict | None = None,
        sex: Sex | None = None,
        pedigree: Optional['PedigreeInfo'] = None,
        alignment_input: AlignmentInput | None = None,
        assays: tuple[Assay, ...] | None = None,
        forced: bool = False,
    ):
        """
        Store the identifiers, sequencing descriptors and optional metadata.

        When no `pedigree` is given, a `PedigreeInfo` is built for this
        sequencing group with `fam_id` set to `participant_id` and `sex`
        defaulting to `Sex.UNKNOWN`. A truthy `sex` argument always overrides
        the sex stored on the pedigree, including a caller-supplied one.
        """
        super().__init__()
        self.id = id
        self.name = id
        self._external_id = external_id
        self.sequencing_type = sequencing_type
        self.sequencing_technology = sequencing_technology
        self.sequencing_platform = sequencing_platform

        self.dataset = dataset
        self._participant_id = participant_id
        self.meta: dict = meta or {}
        # `self.participant_id` falls back to the external / internal ID, so
        # the IDs above must be assigned before the default pedigree is built.
        self.pedigree: PedigreeInfo = pedigree or PedigreeInfo(
            sequencing_group=self,
            fam_id=self.participant_id,
            sex=sex or Sex.UNKNOWN,
        )
        if sex:
            self.pedigree.sex = sex
        self.alignment_input: AlignmentInput | None = alignment_input
        self.assays: tuple[Assay, ...] | None = assays
        self.forced = forced
        self.active = True
        # Only set if the file exists / found in Metamist:
        self.gvcf: GvcfPath | None = None
        self.cram: CramPath | None = None

    def __repr__(self):
        """
        Verbose representation: `SequencingGroup(<dataset>/<id>[|<external id>]`
        followed by `, key=value` pairs and a closing parenthesis.
        """
        values = {
            'participant': self._participant_id if self._participant_id else '',
            'sequencing_type': self.sequencing_type,
            'sequencing_technology': self.sequencing_technology,
            'sequencing_platform': self.sequencing_platform,
            'forced': str(self.forced),
            'active': str(self.active),
            'meta': str(self.meta),
            'alignment_inputs': self.alignment_input,
            'pedigree': self.pedigree,
        }
        head = f'SequencingGroup({self.dataset.name}/{self.id}'
        if self._external_id:
            head += f'|{self._external_id}'
        fields = ''.join(f', {k}={v}' for k, v in values.items())
        # Close the parenthesis opened in `head` so the repr is balanced.
        return f'{head}{fields})'

    def __str__(self):
        """
        Short representation: `SequencingGroup(<dataset>/<id>[|<external id>][|SEQ=<type>:<input kind>])`.

        Raises ValueError when a truthy `alignment_input` is not a `CramPath`,
        `BamPath` or `FastqPairs`.
        """
        ai_tag = ''
        if self.alignment_input:
            ai_tag += f'|SEQ={self.sequencing_type}:'
            if isinstance(self.alignment_input, CramPath):
                ai_tag += 'CRAM'
            elif isinstance(self.alignment_input, BamPath):
                ai_tag += 'BAM'
            elif isinstance(self.alignment_input, FastqPairs):
                ai_tag += f'{len(self.alignment_input)}FQS'
            else:
                raise ValueError(
                    f'Unrecognised alignment input type {type(self.alignment_input)}',
                )

        ext_id = f'|{self._external_id}' if self._external_id else ''
        return f'SequencingGroup({self.dataset.name}/{self.id}{ext_id}{ai_tag})'

    @property
    def participant_id(self) -> str:
        """
        Get ID of participant corresponding to this sequencing group,
        or substitute it with external ID.
        """
        return self._participant_id or self.external_id

    @participant_id.setter
    def participant_id(self, val: str):
        """
        Set participant ID.
        """
        self._participant_id = val

    @property
    def external_id(self) -> str:
        """
        Get external sample ID, or substitute it with the internal ID.
        """
        return self._external_id or self.id

    @property
    def rich_id(self) -> str:
        """
        ID for reporting purposes: composed of internal as well as external
        or participant IDs.
        """
        return self.id + '|' + self.participant_id

    def get_ped_dict(self, use_participant_id: bool = False) -> dict[str, str]:
        """
        Returns a dictionary of pedigree fields for this sequencing group,
        corresponding to a PED file entry.
        """
        return self.pedigree.get_ped_dict(use_participant_id)

    def make_cram_path(self) -> CramPath:
        """
        Path to a CRAM file. Not checking its existence here.
        """
        path = self.dataset.prefix() / 'cram' / f'{self.id}.cram'
        return CramPath(
            path=path,
            index_path=path.with_suffix('.cram.crai'),
            reference_assembly=reference_path('broad/ref_fasta'),
        )

    def make_gvcf_path(self) -> GvcfPath:
        """
        Path to a GVCF file. Not checking its existence here.
        """
        return GvcfPath(self.dataset.prefix() / 'gvcf' / f'{self.id}.g.vcf.gz')

    @property
    def make_sv_evidence_path(self) -> Path:
        """
        Path to the evidence root for GATK-SV evidence files.
        """
        return self.dataset.prefix() / 'sv_evidence'

    @property
    def target_id(self) -> str:
        """Unique target ID"""
        return self.id

    def get_sequencing_groups(
        self,
        only_active: bool = True,
    ) -> list['SequencingGroup']:
        """
        Implementing the abstract method: returns `[self]`, or an empty list
        when `only_active` is set and this sequencing group is not active.
        """
        if only_active and not self.active:
            return []
        return [self]

    def get_job_attrs(self) -> dict:
        """
        Attributes for Hail Batch job.

        `participant_id` is included only when a participant ID or an
        external ID was explicitly provided.
        """
        attrs = {
            'dataset': self.dataset.name,
            'sequencing_group': self.id,
        }
        _participant_id: str | None = self._participant_id or self._external_id
        if _participant_id:
            attrs['participant_id'] = _participant_id
        return attrs

    def get_job_prefix(self) -> str:
        """
        Prefix job names.
        """
        return f'{self.dataset.name}/{self.id}: '
class SequencingGroup(Target):
    """
    Represents a sequencing group.

    Besides the identifiers and sequencing descriptors passed to the
    constructor, an instance carries a `PedigreeInfo`, optional alignment
    input and assays, the `forced` / `active` flags, and `gvcf` / `cram`
    slots that start out as None.
    """

    def __init__(
        self,
        id: str,
        dataset: 'Dataset',
        *,
        sequencing_type: str,
        sequencing_technology: str,
        sequencing_platform: str,
        external_id: str | None = None,
        participant_id: str | None = None,
        meta: dict | None = None,
        sex: Sex | None = None,
        pedigree: Optional['PedigreeInfo'] = None,
        alignment_input: AlignmentInput | None = None,
        assays: tuple[Assay, ...] | None = None,
        forced: bool = False,
    ):
        """
        Store the identifiers, sequencing descriptors and optional metadata.

        When no `pedigree` is given, a `PedigreeInfo` is built for this
        sequencing group with `fam_id` set to `participant_id` and `sex`
        defaulting to `Sex.UNKNOWN`. A truthy `sex` argument always overrides
        the sex stored on the pedigree, including a caller-supplied one.
        """
        super().__init__()
        self.id = id
        self.name = id
        self._external_id = external_id
        self.sequencing_type = sequencing_type
        self.sequencing_technology = sequencing_technology
        self.sequencing_platform = sequencing_platform

        self.dataset = dataset
        self._participant_id = participant_id
        self.meta: dict = meta or {}
        # `self.participant_id` falls back to the external / internal ID, so
        # the IDs above must be assigned before the default pedigree is built.
        self.pedigree: PedigreeInfo = pedigree or PedigreeInfo(
            sequencing_group=self,
            fam_id=self.participant_id,
            sex=sex or Sex.UNKNOWN,
        )
        if sex:
            self.pedigree.sex = sex
        self.alignment_input: AlignmentInput | None = alignment_input
        self.assays: tuple[Assay, ...] | None = assays
        self.forced = forced
        self.active = True
        # Only set if the file exists / found in Metamist:
        self.gvcf: GvcfPath | None = None
        self.cram: CramPath | None = None

    def __repr__(self):
        """
        Verbose representation: `SequencingGroup(<dataset>/<id>[|<external id>]`
        followed by `, key=value` pairs and a closing parenthesis.
        """
        values = {
            'participant': self._participant_id if self._participant_id else '',
            'sequencing_type': self.sequencing_type,
            'sequencing_technology': self.sequencing_technology,
            'sequencing_platform': self.sequencing_platform,
            'forced': str(self.forced),
            'active': str(self.active),
            'meta': str(self.meta),
            'alignment_inputs': self.alignment_input,
            'pedigree': self.pedigree,
        }
        head = f'SequencingGroup({self.dataset.name}/{self.id}'
        if self._external_id:
            head += f'|{self._external_id}'
        fields = ''.join(f', {k}={v}' for k, v in values.items())
        # Close the parenthesis opened in `head` so the repr is balanced.
        return f'{head}{fields})'

    def __str__(self):
        """
        Short representation: `SequencingGroup(<dataset>/<id>[|<external id>][|SEQ=<type>:<input kind>])`.

        Raises ValueError when a truthy `alignment_input` is not a `CramPath`,
        `BamPath` or `FastqPairs`.
        """
        ai_tag = ''
        if self.alignment_input:
            ai_tag += f'|SEQ={self.sequencing_type}:'
            if isinstance(self.alignment_input, CramPath):
                ai_tag += 'CRAM'
            elif isinstance(self.alignment_input, BamPath):
                ai_tag += 'BAM'
            elif isinstance(self.alignment_input, FastqPairs):
                ai_tag += f'{len(self.alignment_input)}FQS'
            else:
                raise ValueError(
                    f'Unrecognised alignment input type {type(self.alignment_input)}',
                )

        ext_id = f'|{self._external_id}' if self._external_id else ''
        return f'SequencingGroup({self.dataset.name}/{self.id}{ext_id}{ai_tag})'

    @property
    def participant_id(self) -> str:
        """
        Get ID of participant corresponding to this sequencing group,
        or substitute it with external ID.
        """
        return self._participant_id or self.external_id

    @participant_id.setter
    def participant_id(self, val: str):
        """
        Set participant ID.
        """
        self._participant_id = val

    @property
    def external_id(self) -> str:
        """
        Get external sample ID, or substitute it with the internal ID.
        """
        return self._external_id or self.id

    @property
    def rich_id(self) -> str:
        """
        ID for reporting purposes: composed of internal as well as external
        or participant IDs.
        """
        return self.id + '|' + self.participant_id

    def get_ped_dict(self, use_participant_id: bool = False) -> dict[str, str]:
        """
        Returns a dictionary of pedigree fields for this sequencing group,
        corresponding to a PED file entry.
        """
        return self.pedigree.get_ped_dict(use_participant_id)

    def make_cram_path(self) -> CramPath:
        """
        Path to a CRAM file. Not checking its existence here.
        """
        path = self.dataset.prefix() / 'cram' / f'{self.id}.cram'
        return CramPath(
            path=path,
            index_path=path.with_suffix('.cram.crai'),
            reference_assembly=reference_path('broad/ref_fasta'),
        )

    def make_gvcf_path(self) -> GvcfPath:
        """
        Path to a GVCF file. Not checking its existence here.
        """
        return GvcfPath(self.dataset.prefix() / 'gvcf' / f'{self.id}.g.vcf.gz')

    @property
    def make_sv_evidence_path(self) -> Path:
        """
        Path to the evidence root for GATK-SV evidence files.
        """
        return self.dataset.prefix() / 'sv_evidence'

    @property
    def target_id(self) -> str:
        """Unique target ID"""
        return self.id

    def get_sequencing_groups(
        self,
        only_active: bool = True,
    ) -> list['SequencingGroup']:
        """
        Implementing the abstract method: returns `[self]`, or an empty list
        when `only_active` is set and this sequencing group is not active.
        """
        if only_active and not self.active:
            return []
        return [self]

    def get_job_attrs(self) -> dict:
        """
        Attributes for Hail Batch job.

        `participant_id` is included only when a participant ID or an
        external ID was explicitly provided.
        """
        attrs = {
            'dataset': self.dataset.name,
            'sequencing_group': self.id,
        }
        _participant_id: str | None = self._participant_id or self._external_id
        if _participant_id:
            attrs['participant_id'] = _participant_id
        return attrs

    def get_job_prefix(self) -> str:
        """
        Prefix job names.
        """
        return f'{self.dataset.name}/{self.id}: '
Represents a sequencing group.
SequencingGroup( id: str, dataset: cpg_flow.targets.dataset.Dataset, *, sequencing_type: str, sequencing_technology: str, sequencing_platform: str, external_id: str | None = None, participant_id: str | None = None, meta: dict | None = None, sex: cpg_flow.targets.types.Sex | None = None, pedigree: Optional[cpg_flow.targets.pedigree_info.PedigreeInfo] = None, alignment_input: cpg_flow.filetypes.AlignmentInput | None = None, assays: tuple[cpg_flow.metamist.Assay, ...] | None = None, forced: bool = False)
def __init__(
    self,
    id: str,
    dataset: 'Dataset',
    *,
    sequencing_type: str,
    sequencing_technology: str,
    sequencing_platform: str,
    external_id: str | None = None,
    participant_id: str | None = None,
    meta: dict | None = None,
    sex: Sex | None = None,
    pedigree: Optional['PedigreeInfo'] = None,
    alignment_input: AlignmentInput | None = None,
    assays: tuple[Assay, ...] | None = None,
    forced: bool = False,
):
    """
    Record identifiers, sequencing descriptors and optional metadata.

    If `pedigree` is not supplied, a `PedigreeInfo` is created for this
    sequencing group using `participant_id` as the family ID and
    `Sex.UNKNOWN` when no sex is given. A truthy `sex` is then written onto
    the pedigree regardless of where the pedigree came from.
    """
    super().__init__()

    # Identifiers: the internal ID doubles as the target name.
    self.id = id
    self.name = id
    self._external_id = external_id
    self._participant_id = participant_id
    self.dataset = dataset

    # Sequencing descriptors.
    self.sequencing_type = sequencing_type
    self.sequencing_technology = sequencing_technology
    self.sequencing_platform = sequencing_platform

    self.meta: dict = meta if meta else {}

    # The default pedigree reads `self.participant_id`, which relies on the
    # identifiers assigned above.
    if not pedigree:
        pedigree = PedigreeInfo(
            sequencing_group=self,
            fam_id=self.participant_id,
            sex=sex or Sex.UNKNOWN,
        )
    self.pedigree: PedigreeInfo = pedigree
    if sex:
        self.pedigree.sex = sex

    self.alignment_input: AlignmentInput | None = alignment_input
    self.assays: tuple[Assay, ...] | None = assays

    # State flags.
    self.forced = forced
    self.active = True

    # Only set if the file exists / found in Metamist:
    self.gvcf: GvcfPath | None = None
    self.cram: CramPath | None = None
participant_id: str
@property
def participant_id(self) -> str:
    """
    Participant ID of this sequencing group; when none is set, the
    external ID is returned in its place.
    """
    explicit_id = self._participant_id
    if explicit_id:
        return explicit_id
    return self.external_id
Get ID of participant corresponding to this sequencing group, or substitute it with external ID.
external_id: str
@property
def external_id(self) -> str:
    """
    External sample ID; when none is set, the internal ID is returned
    in its place.
    """
    provided_id = self._external_id
    if provided_id:
        return provided_id
    return self.id
Get external sample ID, or substitute it with the internal ID.
rich_id: str
@property
def rich_id(self) -> str:
    """
    Reporting ID: the internal ID and the participant ID (which may itself
    be the external or internal ID) joined with '|'.
    """
    id_parts = (self.id, self.participant_id)
    return '|'.join(id_parts)
ID for reporting purposes: composed of internal as well as external or participant IDs.
def
get_ped_dict(self, use_participant_id: bool = False) -> dict[str, str]:
def get_ped_dict(self, use_participant_id: bool = False) -> dict[str, str]:
    """
    Return the pedigree fields of this sequencing group as a dictionary,
    corresponding to a PED file entry.

    The work is delegated to `self.pedigree.get_ped_dict`, which receives
    `use_participant_id` unchanged.
    """
    ped_entry = self.pedigree.get_ped_dict(use_participant_id)
    return ped_entry
Returns a dictionary of pedigree fields for this sequencing group, corresponding to a PED file entry.
def
make_cram_path(self) -> cpg_flow.filetypes.CramPath:
def make_cram_path(self) -> CramPath:
    """
    Build the `CramPath` for this sequencing group without checking that
    the file exists.

    The CRAM is placed at `<dataset prefix>/cram/<id>.cram`; the index path
    is obtained with `with_suffix('.cram.crai')` on the CRAM path, and the
    reference assembly is looked up under 'broad/ref_fasta'.
    """
    cram_file = self.dataset.prefix() / 'cram' / f'{self.id}.cram'
    crai_file = cram_file.with_suffix('.cram.crai')
    ref_fasta = reference_path('broad/ref_fasta')
    return CramPath(
        path=cram_file,
        index_path=crai_file,
        reference_assembly=ref_fasta,
    )
Path to a CRAM file. Not checking its existence here.
def
make_gvcf_path(self) -> cpg_flow.filetypes.GvcfPath:
def make_gvcf_path(self) -> GvcfPath:
    """
    Build the `GvcfPath` for this sequencing group at
    `<dataset prefix>/gvcf/<id>.g.vcf.gz`, without checking that the file
    exists.
    """
    gvcf_dir = self.dataset.prefix() / 'gvcf'
    gvcf_file = gvcf_dir / f'{self.id}.g.vcf.gz'
    return GvcfPath(gvcf_file)
Path to a GVCF file. Not checking its existence here.
make_sv_evidence_path: cloudpathlib.cloudpath.CloudPath | pathlib.Path
@property
def make_sv_evidence_path(self) -> Path:
    """
    Evidence root for GATK-SV evidence files: the 'sv_evidence' entry
    directly under the dataset prefix.
    """
    dataset_root = self.dataset.prefix()
    return dataset_root / 'sv_evidence'
Path to the evidence root for GATK-SV evidence files.
def get_sequencing_groups(
    self,
    only_active: bool = True,
) -> list['SequencingGroup']:
    """
    Implementation of the abstract method: a one-element list holding this
    sequencing group, or an empty list when `only_active` is requested and
    the group is not active.
    """
    include_self = self.active or not only_active
    return [self] if include_self else []
Implementing the abstract method.
def
get_job_attrs(self) -> dict:
def get_job_attrs(self) -> dict:
    """
    Attributes for Hail Batch job: the dataset name and sequencing group ID,
    plus `participant_id` when a participant ID or external ID was
    explicitly set.
    """
    job_attrs = {'dataset': self.dataset.name, 'sequencing_group': self.id}
    # Uses the raw private fields, so the internal ID is never substituted.
    explicit_id: str | None = self._participant_id or self._external_id
    if not explicit_id:
        return job_attrs
    job_attrs['participant_id'] = explicit_id
    return job_attrs
Attributes for Hail Batch job.