cpg_flow.targets.cohort

This module defines the Cohort class, which represents a cohort target in the workflow. A cohort consists of all sequencing groups from a single CustomCohort, potentially spanning multiple datasets.

Classes:
    Cohort: Represents a cohort target in the workflow.

Usage: The Cohort class is used to manage and analyze sequencing groups within a cohort. It provides methods to add sequencing groups, retrieve sequencing groups, write PED files, and export cohort data to TSV files.

Example:
    cohort = Cohort(name="example_cohort")
    cohort.add_sequencing_group_object(sequencing_group)
    ped_file_path = cohort.write_ped_file()
    tsv_file_path = cohort.to_tsv()

  1"""
  2This module defines the `Cohort` class, which represents a cohort target in the workflow.
  3A cohort consists of all sequencing groups from a single CustomCohort, potentially spanning multiple datasets.
  4
  5Classes:
  6    Cohort: Represents a cohort target in the workflow.
  7
  8Usage:
  9    The `Cohort` class is used to manage and analyze sequencing groups within a cohort.
 10    It provides methods to add sequencing groups, retrieve sequencing groups, write PED files,
 11    and export cohort data to TSV files.
 12
 13Example:
 14    cohort = Cohort(name="example_cohort")
 15    cohort.add_sequencing_group_object(sequencing_group)
 16    ped_file_path = cohort.write_ped_file()
 17    tsv_file_path = cohort.to_tsv()
 18
 19"""
 20
 21from typing import TYPE_CHECKING
 22
 23import pandas as pd
 24
 25from cpg_flow.targets import Dataset, Target
 26from cpg_flow.utils import get_logger
 27from cpg_utils import Path, to_path
 28from cpg_utils.config import get_config
 29
 30LOGGER = get_logger(__name__)
 31
 32if TYPE_CHECKING:
 33    from cpg_flow.targets import SequencingGroup
 34
 35
 36class Cohort(Target):
 37    """
 38    Represents a "cohort" target - all sequencing groups from a single CustomCohort (potentially spanning multiple datasets) in the workflow.
 39    Analysis dataset name is required and will be used as the default name for the
 40    cohort.
 41    """
 42
 43    def __init__(self, id: str | None = None, name: str | None = None) -> None:
 44        super().__init__()
 45        self.id = id or get_config()['workflow']['dataset']
 46        self.name = name or get_config()['workflow']['dataset']
 47        self.analysis_dataset = Dataset(name=get_config()['workflow']['dataset'])
 48        self._sequencing_group_by_id: dict[str, SequencingGroup] = {}
 49
 50    def __repr__(self):
 51        return f'Cohort("{self.id}", {len(self._sequencing_group_by_id)} SGs)'
 52
 53    @property
 54    def target_id(self) -> str:
 55        """Unique target ID"""
 56        return self.id
 57
 58    def get_cohort_id(self) -> str:
 59        """Get the cohort ID"""
 60        return self.id
 61
 62    def write_ped_file(
 63        self,
 64        out_path: Path | None = None,
 65        use_participant_id: bool = False,
 66    ) -> Path:
 67        """
 68        Create a PED file for all samples in the whole cohort
 69        PED is written with no header line to be strict specification compliant
 70        """
 71        datas = []
 72        for sequencing_group in self.get_sequencing_groups():
 73            datas.append(
 74                sequencing_group.pedigree.get_ped_dict(
 75                    use_participant_id=use_participant_id,
 76                ),
 77            )
 78        if not datas:
 79            raise ValueError(f'No pedigree data found for {self.id}')
 80        df = pd.DataFrame(datas)
 81
 82        if out_path is None:
 83            out_path = self.analysis_dataset.tmp_prefix() / 'ped' / f'{self.get_alignment_inputs_hash()}.ped'
 84
 85        if not get_config()['workflow'].get('dry_run', False):
 86            with out_path.open('w') as fp:
 87                df.to_csv(fp, sep='\t', index=False, header=False)
 88        return out_path
 89
 90    def add_sequencing_group_object(
 91        self,
 92        s: 'SequencingGroup',
 93        allow_duplicates: bool = True,
 94    ):
 95        """
 96        Add a sequencing group object to the Cohort.
 97        Args:
 98            s: SequencingGroup object
 99            allow_duplicates: if True, allow adding the same object twice
100        """
101        if s.id in self._sequencing_group_by_id:
102            if allow_duplicates:
103                LOGGER.debug(
104                    f'SequencingGroup {s.id} already exists in the Cohort {self.name}',
105                )
106                return self._sequencing_group_by_id[s.id]
107            raise ValueError(
108                f'SequencingGroup {s.id} already exists in the Cohort {self.name}',
109            )
110        self._sequencing_group_by_id[s.id] = s
111
112    def get_sequencing_groups(
113        self,
114        only_active: bool = True,
115    ) -> list['SequencingGroup']:
116        """
117        Gets a flat list of all sequencing groups from all datasets.
118        Include only "active" sequencing groups (unless only_active is False)
119        """
120        return [s for s in self._sequencing_group_by_id.values() if (s.active or not only_active)]
121
122    def get_job_attrs(self) -> dict:
123        """
124        Attributes for Hail Batch job.
125        """
126        return {
127            # 'sequencing_groups': self.get_sequencing_group_ids(),
128        }
129
130    def get_job_prefix(self) -> str:
131        """
132        Prefix job names.
133        """
134        return ''
135
136    def to_tsv(self) -> str:
137        """
138        Export to a parsable TSV file
139        """
140        assert self.get_sequencing_groups()
141        tsv_path = self.analysis_dataset.tmp_prefix() / 'samples.tsv'
142        df = pd.DataFrame(
143            {
144                's': s.id,
145                'gvcf': s.gvcf or '-',
146                'sex': s.meta.get('sex') or '-',
147                'continental_pop': s.meta.get('continental_pop') or '-',
148                'subcontinental_pop': s.meta.get('subcontinental_pop') or '-',
149            }
150            for s in self.get_sequencing_groups()
151        ).set_index('s', drop=False)
152        with to_path(tsv_path).open('w') as f:
153            df.to_csv(f, index=False, sep='\t', na_rep='NA')
154        return tsv_path
LOGGER = <Logger cpg_flow.targets.cohort (INFO)>
class Cohort(cpg_flow.targets.target.Target):

Represents a "cohort" target - all sequencing groups from a single CustomCohort (potentially spanning multiple datasets) in the workflow. Analysis dataset name is required and will be used as the default name for the cohort.

Cohort(id: str | None = None, name: str | None = None)
id
name
analysis_dataset
target_id: str

Unique target ID

def get_cohort_id(self) -> str:

Get the cohort ID

def write_ped_file(self, out_path: cloudpathlib.cloudpath.CloudPath | pathlib.Path | None = None, use_participant_id: bool = False) -> cloudpathlib.cloudpath.CloudPath | pathlib.Path:

Create a PED file for all samples in the whole cohort. The PED file is written with no header line to be strictly specification compliant.
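A usage sketch (the bucket path is hypothetical): with no `out_path`, the file is written under the analysis dataset's tmp prefix, named after the alignment-inputs hash; an explicit path overrides that. Nothing is written when `workflow.dry_run` is set in config, and a `ValueError` is raised if no pedigree data is found.

    from cpg_utils import to_path

    # Hypothetical destination; any cpg_utils Path (local or cloud) works.
    ped_path = cohort.write_ped_file(
        out_path=to_path('gs://my-bucket/tmp/example_cohort.ped'),
        use_participant_id=True,  # passed through to pedigree.get_ped_dict()
    )
    # The returned path points at a tab-separated, header-less PED file
    # with one row per sequencing group in the cohort.
    print(ped_path)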

def add_sequencing_group_object(self, s: cpg_flow.targets.sequencing_group.SequencingGroup, allow_duplicates: bool = True):

Add a sequencing group object to the Cohort.

Args:
    s: SequencingGroup object
    allow_duplicates: if True, allow adding the same object twice
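A sketch of the duplicate handling, assuming `sg` is a `SequencingGroup` obtained elsewhere in the workflow: re-adding an existing ID either logs and returns the stored object (the default) or raises `ValueError` when `allow_duplicates=False`.

    cohort.add_sequencing_group_object(sg)

    # Default behaviour: the duplicate is logged at debug level and the
    # object already stored in the cohort is returned.
    existing = cohort.add_sequencing_group_object(sg)
    assert existing is sg

    # Strict behaviour: re-adding the same ID raises ValueError.
    try:
        cohort.add_sequencing_group_object(sg, allow_duplicates=False)
    except ValueError as err:
        print(err)  # SequencingGroup <id> already exists in the Cohort <name>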

def get_sequencing_groups(self, only_active: bool = True) -> list[cpg_flow.targets.sequencing_group.SequencingGroup]:

Gets a flat list of all sequencing groups from all datasets. Includes only "active" sequencing groups (unless only_active is False).
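A sketch of the `only_active` filter: by default only sequencing groups whose `active` flag is true are returned, while `only_active=False` returns everything that has been added to the cohort.

    all_sgs = cohort.get_sequencing_groups(only_active=False)
    active_sgs = cohort.get_sequencing_groups()

    # Every sequencing group returned by the default call is active,
    # and the active list can never be longer than the full list.
    assert all(sg.active for sg in active_sgs)
    assert len(active_sgs) <= len(all_sgs)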

def get_job_attrs(self) -> dict:

Attributes for Hail Batch job.

def get_job_prefix(self) -> str:

Prefix job names.

def to_tsv(self) -> str:

Export to a parsable TSV file
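A sketch of the output, assuming the cohort is non-empty (the method asserts this): the file is written to `samples.tsv` under the analysis dataset's tmp prefix and the path is returned. Because `to_csv` is called with the default `header=True`, the first line is the column header.

    from cpg_utils import to_path

    tsv_path = cohort.to_tsv()

    # First line is the header row; one data row per active sequencing group,
    # with '-' placeholders for missing gvcf/sex/population metadata.
    with to_path(tsv_path).open() as f:
        header = f.readline().rstrip('\n').split('\t')
    print(header)  # ['s', 'gvcf', 'sex', 'continental_pop', 'subcontinental_pop']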