cpg_flow.targets.sequencing_group

  1from typing import TYPE_CHECKING, Optional
  2
  3from cpg_flow.filetypes import AlignmentInput, BamPath, CramPath, FastqPairs, GvcfPath
  4from cpg_flow.metamist import Assay
  5from cpg_flow.targets import PedigreeInfo, Sex, Target
  6from cpg_utils import Path
  7from cpg_utils.config import reference_path
  8
  9if TYPE_CHECKING:
 10    from cpg_flow.targets import Dataset
 11
 12
 13class SequencingGroup(Target):
 14    """
 15    Represents a sequencing group.
 16    """
 17
 18    def __init__(
 19        self,
 20        id: str,
 21        dataset: 'Dataset',
 22        *,
 23        sequencing_type: str,
 24        sequencing_technology: str,
 25        sequencing_platform: str,
 26        external_id: str | None = None,
 27        participant_id: str | None = None,
 28        meta: dict | None = None,
 29        sex: Sex | None = None,
 30        pedigree: Optional['PedigreeInfo'] = None,
 31        alignment_input: AlignmentInput | None = None,
 32        assays: tuple[Assay, ...] | None = None,
 33        forced: bool = False,
 34    ):
 35        super().__init__()
 36        self.id = id
 37        self.name = id
 38        self._external_id = external_id
 39        self.sequencing_type = sequencing_type
 40        self.sequencing_technology = sequencing_technology
 41        self.sequencing_platform = sequencing_platform
 42
 43        self.dataset = dataset
 44        self._participant_id = participant_id
 45        self.meta: dict = meta or dict()
 46        self.pedigree: PedigreeInfo = pedigree or PedigreeInfo(
 47            sequencing_group=self,
 48            fam_id=self.participant_id,
 49            sex=sex or Sex.UNKNOWN,
 50        )
 51        if sex:
 52            self.pedigree.sex = sex
 53        self.alignment_input: AlignmentInput | None = alignment_input
 54        self.assays: tuple[Assay, ...] | None = assays
 55        self.forced = forced
 56        self.active = True
 57        # Only set if the file exists / found in Metamist:
 58        self.gvcf: GvcfPath | None = None
 59        self.cram: CramPath | None = None
 60
 61    def __repr__(self):
 62        values = {
 63            'participant': self._participant_id if self._participant_id else '',
 64            'sequencing_type': self.sequencing_type,
 65            'sequencing_technology': self.sequencing_technology,
 66            'sequencing_platform': self.sequencing_platform,
 67            'forced': str(self.forced),
 68            'active': str(self.active),
 69            'meta': str(self.meta),
 70            'alignment_inputs': self.alignment_input,
 71            'pedigree': self.pedigree,
 72        }
 73        retval = f'SequencingGroup({self.dataset.name}/{self.id}'
 74        if self._external_id:
 75            retval += f'|{self._external_id}'
 76        return retval + ''.join(f', {k}={v}' for k, v in values.items())
 77
 78    def __str__(self):
 79        ai_tag = ''
 80        if self.alignment_input:
 81            ai_tag += f'|SEQ={self.sequencing_type}:'
 82            if isinstance(self.alignment_input, CramPath):
 83                ai_tag += 'CRAM'
 84            elif isinstance(self.alignment_input, BamPath):
 85                ai_tag += 'BAM'
 86            elif isinstance(self.alignment_input, FastqPairs):
 87                ai_tag += f'{len(self.alignment_input)}FQS'
 88            else:
 89                raise ValueError(
 90                    f'Unrecognised alignment input type {type(self.alignment_input)}',
 91                )
 92
 93        ext_id = f'|{self._external_id}' if self._external_id else ''
 94        return f'SequencingGroup({self.dataset.name}/{self.id}{ext_id}{ai_tag})'
 95
 96    @property
 97    def participant_id(self) -> str:
 98        """
 99        Get ID of participant corresponding to this sequencing group,
100        or substitute it with external ID.
101        """
102        return self._participant_id or self.external_id
103
104    @participant_id.setter
105    def participant_id(self, val: str):
106        """
107        Set participant ID.
108        """
109        self._participant_id = val
110
111    @property
112    def external_id(self) -> str:
113        """
114        Get external sample ID, or substitute it with the internal ID.
115        """
116        return self._external_id or self.id
117
118    @property
119    def rich_id(self) -> str:
120        """
121        ID for reporting purposes: composed of internal as well as external
122        or participant IDs.
123        """
124        return self.id + '|' + self.participant_id
125
126    def get_ped_dict(self, use_participant_id: bool = False) -> dict[str, str]:
127        """
128        Returns a dictionary of pedigree fields for this sequencing group, corresponding
129        a PED file entry.
130        """
131        return self.pedigree.get_ped_dict(use_participant_id)
132
133    def make_cram_path(self) -> CramPath:
134        """
135        Path to a CRAM file. Not checking its existence here.
136        """
137        path = self.dataset.prefix() / 'cram' / f'{self.id}.cram'
138        return CramPath(
139            path=path,
140            index_path=path.with_suffix('.cram.crai'),
141            reference_assembly=reference_path('broad/ref_fasta'),
142        )
143
144    def make_gvcf_path(self) -> GvcfPath:
145        """
146        Path to a GVCF file. Not checking its existence here.
147        """
148        return GvcfPath(self.dataset.prefix() / 'gvcf' / f'{self.id}.g.vcf.gz')
149
150    @property
151    def make_sv_evidence_path(self) -> Path:
152        """
153        Path to the evidence root for GATK-SV evidence files.
154        """
155        return self.dataset.prefix() / 'sv_evidence'
156
157    @property
158    def target_id(self) -> str:
159        """Unique target ID"""
160        return self.id
161
162    def get_sequencing_groups(
163        self,
164        only_active: bool = True,
165    ) -> list['SequencingGroup']:
166        """
167        Implementing the abstract method.
168        """
169        if only_active and not self.active:
170            return []
171        return [self]
172
173    def get_job_attrs(self) -> dict:
174        """
175        Attributes for Hail Batch job.
176        """
177        attrs = {
178            'dataset': self.dataset.name,
179            'sequencing_group': self.id,
180        }
181        _participant_id: str | None = self._participant_id or self._external_id
182        if _participant_id:
183            attrs['participant_id'] = _participant_id
184        return attrs
185
186    def get_job_prefix(self) -> str:
187        """
188        Prefix job names.
189        """
190        return f'{self.dataset.name}/{self.id}: '
class SequencingGroup(cpg_flow.targets.target.Target):
 14class SequencingGroup(Target):
 15    """
 16    Represents a sequencing group.
 17    """
 18
 19    def __init__(
 20        self,
 21        id: str,
 22        dataset: 'Dataset',
 23        *,
 24        sequencing_type: str,
 25        sequencing_technology: str,
 26        sequencing_platform: str,
 27        external_id: str | None = None,
 28        participant_id: str | None = None,
 29        meta: dict | None = None,
 30        sex: Sex | None = None,
 31        pedigree: Optional['PedigreeInfo'] = None,
 32        alignment_input: AlignmentInput | None = None,
 33        assays: tuple[Assay, ...] | None = None,
 34        forced: bool = False,
 35    ):
 36        super().__init__()
 37        self.id = id
 38        self.name = id
 39        self._external_id = external_id
 40        self.sequencing_type = sequencing_type
 41        self.sequencing_technology = sequencing_technology
 42        self.sequencing_platform = sequencing_platform
 43
 44        self.dataset = dataset
 45        self._participant_id = participant_id
 46        self.meta: dict = meta or dict()
 47        self.pedigree: PedigreeInfo = pedigree or PedigreeInfo(
 48            sequencing_group=self,
 49            fam_id=self.participant_id,
 50            sex=sex or Sex.UNKNOWN,
 51        )
 52        if sex:
 53            self.pedigree.sex = sex
 54        self.alignment_input: AlignmentInput | None = alignment_input
 55        self.assays: tuple[Assay, ...] | None = assays
 56        self.forced = forced
 57        self.active = True
 58        # Only set if the file exists / found in Metamist:
 59        self.gvcf: GvcfPath | None = None
 60        self.cram: CramPath | None = None
 61
 62    def __repr__(self):
 63        values = {
 64            'participant': self._participant_id if self._participant_id else '',
 65            'sequencing_type': self.sequencing_type,
 66            'sequencing_technology': self.sequencing_technology,
 67            'sequencing_platform': self.sequencing_platform,
 68            'forced': str(self.forced),
 69            'active': str(self.active),
 70            'meta': str(self.meta),
 71            'alignment_inputs': self.alignment_input,
 72            'pedigree': self.pedigree,
 73        }
 74        retval = f'SequencingGroup({self.dataset.name}/{self.id}'
 75        if self._external_id:
 76            retval += f'|{self._external_id}'
 77        return retval + ''.join(f', {k}={v}' for k, v in values.items())
 78
 79    def __str__(self):
 80        ai_tag = ''
 81        if self.alignment_input:
 82            ai_tag += f'|SEQ={self.sequencing_type}:'
 83            if isinstance(self.alignment_input, CramPath):
 84                ai_tag += 'CRAM'
 85            elif isinstance(self.alignment_input, BamPath):
 86                ai_tag += 'BAM'
 87            elif isinstance(self.alignment_input, FastqPairs):
 88                ai_tag += f'{len(self.alignment_input)}FQS'
 89            else:
 90                raise ValueError(
 91                    f'Unrecognised alignment input type {type(self.alignment_input)}',
 92                )
 93
 94        ext_id = f'|{self._external_id}' if self._external_id else ''
 95        return f'SequencingGroup({self.dataset.name}/{self.id}{ext_id}{ai_tag})'
 96
 97    @property
 98    def participant_id(self) -> str:
 99        """
100        Get ID of participant corresponding to this sequencing group,
101        or substitute it with external ID.
102        """
103        return self._participant_id or self.external_id
104
105    @participant_id.setter
106    def participant_id(self, val: str):
107        """
108        Set participant ID.
109        """
110        self._participant_id = val
111
112    @property
113    def external_id(self) -> str:
114        """
115        Get external sample ID, or substitute it with the internal ID.
116        """
117        return self._external_id or self.id
118
119    @property
120    def rich_id(self) -> str:
121        """
122        ID for reporting purposes: composed of internal as well as external
123        or participant IDs.
124        """
125        return self.id + '|' + self.participant_id
126
127    def get_ped_dict(self, use_participant_id: bool = False) -> dict[str, str]:
128        """
129        Returns a dictionary of pedigree fields for this sequencing group, corresponding
130        a PED file entry.
131        """
132        return self.pedigree.get_ped_dict(use_participant_id)
133
134    def make_cram_path(self) -> CramPath:
135        """
136        Path to a CRAM file. Not checking its existence here.
137        """
138        path = self.dataset.prefix() / 'cram' / f'{self.id}.cram'
139        return CramPath(
140            path=path,
141            index_path=path.with_suffix('.cram.crai'),
142            reference_assembly=reference_path('broad/ref_fasta'),
143        )
144
145    def make_gvcf_path(self) -> GvcfPath:
146        """
147        Path to a GVCF file. Not checking its existence here.
148        """
149        return GvcfPath(self.dataset.prefix() / 'gvcf' / f'{self.id}.g.vcf.gz')
150
151    @property
152    def make_sv_evidence_path(self) -> Path:
153        """
154        Path to the evidence root for GATK-SV evidence files.
155        """
156        return self.dataset.prefix() / 'sv_evidence'
157
158    @property
159    def target_id(self) -> str:
160        """Unique target ID"""
161        return self.id
162
163    def get_sequencing_groups(
164        self,
165        only_active: bool = True,
166    ) -> list['SequencingGroup']:
167        """
168        Implementing the abstract method.
169        """
170        if only_active and not self.active:
171            return []
172        return [self]
173
174    def get_job_attrs(self) -> dict:
175        """
176        Attributes for Hail Batch job.
177        """
178        attrs = {
179            'dataset': self.dataset.name,
180            'sequencing_group': self.id,
181        }
182        _participant_id: str | None = self._participant_id or self._external_id
183        if _participant_id:
184            attrs['participant_id'] = _participant_id
185        return attrs
186
187    def get_job_prefix(self) -> str:
188        """
189        Prefix job names.
190        """
191        return f'{self.dataset.name}/{self.id}: '

Represents a sequencing group.

SequencingGroup( id: str, dataset: cpg_flow.targets.dataset.Dataset, *, sequencing_type: str, sequencing_technology: str, sequencing_platform: str, external_id: str | None = None, participant_id: str | None = None, meta: dict | None = None, sex: cpg_flow.targets.types.Sex | None = None, pedigree: Optional[cpg_flow.targets.pedigree_info.PedigreeInfo] = None, alignment_input: cpg_flow.filetypes.AlignmentInput | None = None, assays: tuple[cpg_flow.metamist.Assay, ...] | None = None, forced: bool = False)
19    def __init__(
20        self,
21        id: str,
22        dataset: 'Dataset',
23        *,
24        sequencing_type: str,
25        sequencing_technology: str,
26        sequencing_platform: str,
27        external_id: str | None = None,
28        participant_id: str | None = None,
29        meta: dict | None = None,
30        sex: Sex | None = None,
31        pedigree: Optional['PedigreeInfo'] = None,
32        alignment_input: AlignmentInput | None = None,
33        assays: tuple[Assay, ...] | None = None,
34        forced: bool = False,
35    ):
36        super().__init__()
37        self.id = id
38        self.name = id
39        self._external_id = external_id
40        self.sequencing_type = sequencing_type
41        self.sequencing_technology = sequencing_technology
42        self.sequencing_platform = sequencing_platform
43
44        self.dataset = dataset
45        self._participant_id = participant_id
46        self.meta: dict = meta or dict()
47        self.pedigree: PedigreeInfo = pedigree or PedigreeInfo(
48            sequencing_group=self,
49            fam_id=self.participant_id,
50            sex=sex or Sex.UNKNOWN,
51        )
52        if sex:
53            self.pedigree.sex = sex
54        self.alignment_input: AlignmentInput | None = alignment_input
55        self.assays: tuple[Assay, ...] | None = assays
56        self.forced = forced
57        self.active = True
58        # Only set if the file exists / found in Metamist:
59        self.gvcf: GvcfPath | None = None
60        self.cram: CramPath | None = None
id
name
sequencing_type
sequencing_technology
sequencing_platform
dataset
meta: dict
alignment_input: cpg_flow.filetypes.AlignmentInput | None
assays: tuple[cpg_flow.metamist.Assay, ...] | None
forced
active
gvcf: cpg_flow.filetypes.GvcfPath | None
cram: cpg_flow.filetypes.CramPath | None
participant_id: str
 97    @property
 98    def participant_id(self) -> str:
 99        """
100        Get ID of participant corresponding to this sequencing group,
101        or substitute it with external ID.
102        """
103        return self._participant_id or self.external_id

Get ID of participant corresponding to this sequencing group, or substitute it with external ID.

external_id: str
112    @property
113    def external_id(self) -> str:
114        """
115        Get external sample ID, or substitute it with the internal ID.
116        """
117        return self._external_id or self.id

Get external sample ID, or substitute it with the internal ID.

rich_id: str
119    @property
120    def rich_id(self) -> str:
121        """
122        ID for reporting purposes: composed of internal as well as external
123        or participant IDs.
124        """
125        return self.id + '|' + self.participant_id

ID for reporting purposes: composed of internal as well as external or participant IDs.

def get_ped_dict(self, use_participant_id: bool = False) -> dict[str, str]:
127    def get_ped_dict(self, use_participant_id: bool = False) -> dict[str, str]:
128        """
129        Returns a dictionary of pedigree fields for this sequencing group, corresponding
130        a PED file entry.
131        """
132        return self.pedigree.get_ped_dict(use_participant_id)

Returns a dictionary of pedigree fields for this sequencing group, corresponding to a PED file entry.

def make_cram_path(self) -> cpg_flow.filetypes.CramPath:
134    def make_cram_path(self) -> CramPath:
135        """
136        Path to a CRAM file. Not checking its existence here.
137        """
138        path = self.dataset.prefix() / 'cram' / f'{self.id}.cram'
139        return CramPath(
140            path=path,
141            index_path=path.with_suffix('.cram.crai'),
142            reference_assembly=reference_path('broad/ref_fasta'),
143        )

Path to a CRAM file. Not checking its existence here.

def make_gvcf_path(self) -> cpg_flow.filetypes.GvcfPath:
145    def make_gvcf_path(self) -> GvcfPath:
146        """
147        Path to a GVCF file. Not checking its existence here.
148        """
149        return GvcfPath(self.dataset.prefix() / 'gvcf' / f'{self.id}.g.vcf.gz')

Path to a GVCF file. Not checking its existence here.

make_sv_evidence_path: cloudpathlib.cloudpath.CloudPath | pathlib.Path
151    @property
152    def make_sv_evidence_path(self) -> Path:
153        """
154        Path to the evidence root for GATK-SV evidence files.
155        """
156        return self.dataset.prefix() / 'sv_evidence'

Path to the evidence root for GATK-SV evidence files.

target_id: str
158    @property
159    def target_id(self) -> str:
160        """Unique target ID"""
161        return self.id

Unique target ID

def get_sequencing_groups( self, only_active: bool = True) -> list[SequencingGroup]:
163    def get_sequencing_groups(
164        self,
165        only_active: bool = True,
166    ) -> list['SequencingGroup']:
167        """
168        Implementing the abstract method.
169        """
170        if only_active and not self.active:
171            return []
172        return [self]

Implementing the abstract method.

def get_job_attrs(self) -> dict:
174    def get_job_attrs(self) -> dict:
175        """
176        Attributes for Hail Batch job.
177        """
178        attrs = {
179            'dataset': self.dataset.name,
180            'sequencing_group': self.id,
181        }
182        _participant_id: str | None = self._participant_id or self._external_id
183        if _participant_id:
184            attrs['participant_id'] = _participant_id
185        return attrs

Attributes for Hail Batch job.

def get_job_prefix(self) -> str:
187    def get_job_prefix(self) -> str:
188        """
189        Prefix job names.
190        """
191        return f'{self.dataset.name}/{self.id}: '

Prefix job names.