cpg_flow.targets.multicohort

This module defines classes for handling multi-cohort and cohort targets in a workflow. It includes functionality for managing datasets, sequencing groups, and generating PED files.

Classes: MultiCohort: Represents a multi-cohort target with multiple cohorts in the workflow. Cohort: Represents a cohort target with all sequencing groups from a single CustomCohort. Dataset: Represents a CPG dataset. Sex: Enum for representing sex as in PED format. SequencingGroup: Represents a sequencing group. PedigreeInfo: Represents pedigree relationships and other PED data.

Functions: seq_type_subdir: Returns a subdirectory parametrized by sequencing type.

  1"""
  2This module defines classes for handling multi-cohort and cohort targets in a workflow.
  3It includes functionality for managing datasets, sequencing groups, and generating PED files.
  4
  5Classes:
  6    MultiCohort: Represents a multi-cohort target with multiple cohorts in the workflow.
  7    Cohort: Represents a cohort target with all sequencing groups from a single CustomCohort.
  8    Dataset: Represents a CPG dataset.
  9    Sex: Enum for representing sex as in PED format.
 10    SequencingGroup: Represents a sequencing group.
 11    PedigreeInfo: Represents pedigree relationships and other PED data.
 12
 13Functions:
 14    seq_type_subdir: Returns a subdirectory parametrized by sequencing type.
 15"""
 16
 17from typing import TYPE_CHECKING, Optional
 18
 19import pandas as pd
 20
 21from cpg_flow.targets import Cohort, Dataset, Target
 22from cpg_flow.utils import get_logger
 23from cpg_utils import Path
 24from cpg_utils.config import get_config
 25
 26LOGGER = get_logger(__name__)
 27
 28if TYPE_CHECKING:
 29    from cpg_flow.targets import SequencingGroup
 30
 31
 32class MultiCohort(Target):
 33    """
 34    Represents a "multi-cohort" target - multiple cohorts in the workflow.
 35    """
 36
 37    def __init__(self) -> None:
 38        super().__init__()
 39
 40        # NOTE: For a cohort, we simply pull the dataset name from the config.
 41        input_cohorts = get_config()['workflow'].get('input_cohorts', [])
 42        if input_cohorts:
 43            self.name = '_'.join(sorted(input_cohorts))
 44        else:
 45            self.name = get_config()['workflow']['dataset']
 46
 47        assert self.name, 'Ensure cohorts or dataset is defined in the config file.'
 48
 49        self._cohorts_by_name: dict[str, Cohort] = {}
 50        self._datasets_by_name: dict[str, Dataset] = {}
 51        self.analysis_dataset = Dataset(name=get_config()['workflow']['dataset'])
 52
 53    def __repr__(self):
 54        return f'MultiCohort({len(self.get_cohorts())} cohorts)'
 55
 56    @property
 57    def target_id(self) -> str:
 58        """Unique target ID"""
 59        return self.name
 60
 61    def create_dataset(self, name: str) -> 'Dataset':
 62        """
 63        Create a dataset and add it to the cohort.
 64        """
 65        if name in self._datasets_by_name:
 66            return self._datasets_by_name[name]
 67
 68        if name == self.analysis_dataset.name:
 69            ds = self.analysis_dataset
 70        else:
 71            ds = Dataset(name=name)
 72
 73        self._datasets_by_name[ds.name] = ds
 74        return ds
 75
 76    def get_cohorts(self, only_active: bool = True) -> list['Cohort']:
 77        """
 78        Gets list of all cohorts.
 79        Include only "active" cohorts (unless only_active is False)
 80        """
 81        cohorts = list(self._cohorts_by_name.values())
 82        if only_active:
 83            cohorts = [c for c in cohorts if c.active]
 84        return cohorts
 85
 86    def get_cohort_ids(self, only_active: bool = True) -> list['str']:
 87        """
 88        Get list of cohort IDs.
 89        Include only "active" cohorts (unless only_active is False)
 90        """
 91        return [c.get_cohort_id() for c in self.get_cohorts(only_active)]
 92
 93    def get_cohort_by_name(
 94        self,
 95        name: str,
 96        only_active: bool = True,
 97    ) -> Optional['Cohort']:
 98        """
 99        Get cohort by name.
100        Include only "active" cohorts (unless only_active is False)
101        """
102        cohort = self._cohorts_by_name.get(name)
103        if not cohort:
104            LOGGER.warning(f'Cohort {name} not found in the multi-cohort')
105
106        if not only_active:  # Return cohort even if it's inactive
107            return cohort
108        if isinstance(cohort, Cohort) and cohort.active:
109            return cohort
110        return None
111
112    def get_datasets(self, only_active: bool = True) -> list['Dataset']:
113        """
114        Gets list of all datasets.
115        Include only "active" datasets (unless only_active is False)
116        """
117        all_datasets = list(self._datasets_by_name.values())
118        if only_active:
119            all_datasets = [d for d in all_datasets if d.active and d.get_sequencing_groups()]
120        return all_datasets
121
122    def get_sequencing_groups(
123        self,
124        only_active: bool = True,
125    ) -> list['SequencingGroup']:
126        """
127        Gets a flat list of all sequencing groups from all datasets.
128        uses a dictionary to avoid duplicates (we could have the same sequencing group in multiple cohorts)
129        Include only "active" sequencing groups (unless only_active is False)
130        """
131        all_sequencing_groups: dict[str, SequencingGroup] = {}
132        for dataset in self.get_datasets(only_active):
133            for sg in dataset.get_sequencing_groups(only_active):
134                all_sequencing_groups[sg.id] = sg
135        return list(all_sequencing_groups.values())
136
137    def create_cohort(self, name: str):
138        """
139        Create a cohort and add it to the multi-cohort.
140        """
141        if name in self._cohorts_by_name:
142            LOGGER.debug(f'Cohort {name} already exists in the multi-cohort')
143            return self._cohorts_by_name[name]
144
145        c = Cohort(name=name)
146        self._cohorts_by_name[c.name] = c
147        return c
148
149    def add_dataset(self, d: 'Dataset') -> 'Dataset':
150        """
151        Add a Dataset to the MultiCohort
152        Args:
153            d: Dataset object
154        """
155        if d.name in self._datasets_by_name:
156            LOGGER.debug(
157                f'Dataset {d.name} already exists in the MultiCohort {self.name}',
158            )
159        else:
160            # We need create a new dataset to avoid manipulating the cohort dataset at this point
161            self._datasets_by_name[d.name] = Dataset(d.name)
162        return self._datasets_by_name[d.name]
163
164    def get_dataset_by_name(
165        self,
166        name: str,
167        only_active: bool = True,
168    ) -> Optional['Dataset']:
169        """
170        Get dataset by name.
171        Include only "active" datasets (unless only_active is False)
172        """
173        ds_by_name = {d.name: d for d in self.get_datasets(only_active)}
174        return ds_by_name.get(name)
175
176    def get_job_attrs(self) -> dict:
177        """
178        Attributes for Hail Batch job.
179        """
180        return {
181            # 'sequencing_groups': self.get_sequencing_group_ids(),
182            'datasets': [d.name for d in self.get_datasets()],
183            'cohorts': [c.name for c in self.get_cohorts()],
184        }
185
186    def write_ped_file(
187        self,
188        out_path: Path | None = None,
189        use_participant_id: bool = False,
190    ) -> Path:
191        """
192        Create a PED file for all samples in the whole MultiCohort
193        Duplication of the Cohort method
194        PED is written with no header line to be strict specification compliant
195        """
196        datas = []
197        for sequencing_group in self.get_sequencing_groups():
198            datas.append(
199                sequencing_group.pedigree.get_ped_dict(
200                    use_participant_id=use_participant_id,
201                ),
202            )
203        if not datas:
204            raise ValueError(f'No pedigree data found for {self.name}')
205        df = pd.DataFrame(datas)
206
207        if out_path is None:
208            out_path = self.analysis_dataset.tmp_prefix() / 'ped' / f'{self.get_alignment_inputs_hash()}.ped'
209
210        if not get_config()['workflow'].get('dry_run', False):
211            with out_path.open('w') as fp:
212                df.to_csv(fp, sep='\t', index=False, header=False)
213        return out_path
LOGGER = <Logger cpg_flow.targets.multicohort (INFO)>
class MultiCohort(cpg_flow.targets.target.Target):
 33class MultiCohort(Target):
 34    """
 35    Represents a "multi-cohort" target - multiple cohorts in the workflow.
 36    """
 37
 38    def __init__(self) -> None:
 39        super().__init__()
 40
 41        # NOTE: For a cohort, we simply pull the dataset name from the config.
 42        input_cohorts = get_config()['workflow'].get('input_cohorts', [])
 43        if input_cohorts:
 44            self.name = '_'.join(sorted(input_cohorts))
 45        else:
 46            self.name = get_config()['workflow']['dataset']
 47
 48        assert self.name, 'Ensure cohorts or dataset is defined in the config file.'
 49
 50        self._cohorts_by_name: dict[str, Cohort] = {}
 51        self._datasets_by_name: dict[str, Dataset] = {}
 52        self.analysis_dataset = Dataset(name=get_config()['workflow']['dataset'])
 53
 54    def __repr__(self):
 55        return f'MultiCohort({len(self.get_cohorts())} cohorts)'
 56
 57    @property
 58    def target_id(self) -> str:
 59        """Unique target ID"""
 60        return self.name
 61
 62    def create_dataset(self, name: str) -> 'Dataset':
 63        """
 64        Create a dataset and add it to the cohort.
 65        """
 66        if name in self._datasets_by_name:
 67            return self._datasets_by_name[name]
 68
 69        if name == self.analysis_dataset.name:
 70            ds = self.analysis_dataset
 71        else:
 72            ds = Dataset(name=name)
 73
 74        self._datasets_by_name[ds.name] = ds
 75        return ds
 76
 77    def get_cohorts(self, only_active: bool = True) -> list['Cohort']:
 78        """
 79        Gets list of all cohorts.
 80        Include only "active" cohorts (unless only_active is False)
 81        """
 82        cohorts = list(self._cohorts_by_name.values())
 83        if only_active:
 84            cohorts = [c for c in cohorts if c.active]
 85        return cohorts
 86
 87    def get_cohort_ids(self, only_active: bool = True) -> list['str']:
 88        """
 89        Get list of cohort IDs.
 90        Include only "active" cohorts (unless only_active is False)
 91        """
 92        return [c.get_cohort_id() for c in self.get_cohorts(only_active)]
 93
 94    def get_cohort_by_name(
 95        self,
 96        name: str,
 97        only_active: bool = True,
 98    ) -> Optional['Cohort']:
 99        """
100        Get cohort by name.
101        Include only "active" cohorts (unless only_active is False)
102        """
103        cohort = self._cohorts_by_name.get(name)
104        if not cohort:
105            LOGGER.warning(f'Cohort {name} not found in the multi-cohort')
106
107        if not only_active:  # Return cohort even if it's inactive
108            return cohort
109        if isinstance(cohort, Cohort) and cohort.active:
110            return cohort
111        return None
112
113    def get_datasets(self, only_active: bool = True) -> list['Dataset']:
114        """
115        Gets list of all datasets.
116        Include only "active" datasets (unless only_active is False)
117        """
118        all_datasets = list(self._datasets_by_name.values())
119        if only_active:
120            all_datasets = [d for d in all_datasets if d.active and d.get_sequencing_groups()]
121        return all_datasets
122
123    def get_sequencing_groups(
124        self,
125        only_active: bool = True,
126    ) -> list['SequencingGroup']:
127        """
128        Gets a flat list of all sequencing groups from all datasets.
129        uses a dictionary to avoid duplicates (we could have the same sequencing group in multiple cohorts)
130        Include only "active" sequencing groups (unless only_active is False)
131        """
132        all_sequencing_groups: dict[str, SequencingGroup] = {}
133        for dataset in self.get_datasets(only_active):
134            for sg in dataset.get_sequencing_groups(only_active):
135                all_sequencing_groups[sg.id] = sg
136        return list(all_sequencing_groups.values())
137
138    def create_cohort(self, name: str):
139        """
140        Create a cohort and add it to the multi-cohort.
141        """
142        if name in self._cohorts_by_name:
143            LOGGER.debug(f'Cohort {name} already exists in the multi-cohort')
144            return self._cohorts_by_name[name]
145
146        c = Cohort(name=name)
147        self._cohorts_by_name[c.name] = c
148        return c
149
150    def add_dataset(self, d: 'Dataset') -> 'Dataset':
151        """
152        Add a Dataset to the MultiCohort
153        Args:
154            d: Dataset object
155        """
156        if d.name in self._datasets_by_name:
157            LOGGER.debug(
158                f'Dataset {d.name} already exists in the MultiCohort {self.name}',
159            )
160        else:
161            # We need create a new dataset to avoid manipulating the cohort dataset at this point
162            self._datasets_by_name[d.name] = Dataset(d.name)
163        return self._datasets_by_name[d.name]
164
165    def get_dataset_by_name(
166        self,
167        name: str,
168        only_active: bool = True,
169    ) -> Optional['Dataset']:
170        """
171        Get dataset by name.
172        Include only "active" datasets (unless only_active is False)
173        """
174        ds_by_name = {d.name: d for d in self.get_datasets(only_active)}
175        return ds_by_name.get(name)
176
177    def get_job_attrs(self) -> dict:
178        """
179        Attributes for Hail Batch job.
180        """
181        return {
182            # 'sequencing_groups': self.get_sequencing_group_ids(),
183            'datasets': [d.name for d in self.get_datasets()],
184            'cohorts': [c.name for c in self.get_cohorts()],
185        }
186
187    def write_ped_file(
188        self,
189        out_path: Path | None = None,
190        use_participant_id: bool = False,
191    ) -> Path:
192        """
193        Create a PED file for all samples in the whole MultiCohort
194        Duplication of the Cohort method
195        PED is written with no header line to be strict specification compliant
196        """
197        datas = []
198        for sequencing_group in self.get_sequencing_groups():
199            datas.append(
200                sequencing_group.pedigree.get_ped_dict(
201                    use_participant_id=use_participant_id,
202                ),
203            )
204        if not datas:
205            raise ValueError(f'No pedigree data found for {self.name}')
206        df = pd.DataFrame(datas)
207
208        if out_path is None:
209            out_path = self.analysis_dataset.tmp_prefix() / 'ped' / f'{self.get_alignment_inputs_hash()}.ped'
210
211        if not get_config()['workflow'].get('dry_run', False):
212            with out_path.open('w') as fp:
213                df.to_csv(fp, sep='\t', index=False, header=False)
214        return out_path

Represents a "multi-cohort" target - multiple cohorts in the workflow.

analysis_dataset
target_id: str
57    @property
58    def target_id(self) -> str:
59        """Unique target ID"""
60        return self.name

Unique target ID

def create_dataset(self, name: str) -> cpg_flow.targets.dataset.Dataset:
62    def create_dataset(self, name: str) -> 'Dataset':
63        """
64        Create a dataset and add it to the cohort.
65        """
66        if name in self._datasets_by_name:
67            return self._datasets_by_name[name]
68
69        if name == self.analysis_dataset.name:
70            ds = self.analysis_dataset
71        else:
72            ds = Dataset(name=name)
73
74        self._datasets_by_name[ds.name] = ds
75        return ds

Create a dataset and add it to the cohort.

def get_cohorts(self, only_active: bool = True) -> list[cpg_flow.targets.cohort.Cohort]:
77    def get_cohorts(self, only_active: bool = True) -> list['Cohort']:
78        """
79        Gets list of all cohorts.
80        Include only "active" cohorts (unless only_active is False)
81        """
82        cohorts = list(self._cohorts_by_name.values())
83        if only_active:
84            cohorts = [c for c in cohorts if c.active]
85        return cohorts

Gets list of all cohorts. Include only "active" cohorts (unless only_active is False)

def get_cohort_ids(self, only_active: bool = True) -> list[str]:
87    def get_cohort_ids(self, only_active: bool = True) -> list['str']:
88        """
89        Get list of cohort IDs.
90        Include only "active" cohorts (unless only_active is False)
91        """
92        return [c.get_cohort_id() for c in self.get_cohorts(only_active)]

Get list of cohort IDs. Include only "active" cohorts (unless only_active is False)

def get_cohort_by_name( self, name: str, only_active: bool = True) -> Optional[cpg_flow.targets.cohort.Cohort]:
 94    def get_cohort_by_name(
 95        self,
 96        name: str,
 97        only_active: bool = True,
 98    ) -> Optional['Cohort']:
 99        """
100        Get cohort by name.
101        Include only "active" cohorts (unless only_active is False)
102        """
103        cohort = self._cohorts_by_name.get(name)
104        if not cohort:
105            LOGGER.warning(f'Cohort {name} not found in the multi-cohort')
106
107        if not only_active:  # Return cohort even if it's inactive
108            return cohort
109        if isinstance(cohort, Cohort) and cohort.active:
110            return cohort
111        return None

Get cohort by name. Include only "active" cohorts (unless only_active is False)

def get_datasets(self, only_active: bool = True) -> list[cpg_flow.targets.dataset.Dataset]:
113    def get_datasets(self, only_active: bool = True) -> list['Dataset']:
114        """
115        Gets list of all datasets.
116        Include only "active" datasets (unless only_active is False)
117        """
118        all_datasets = list(self._datasets_by_name.values())
119        if only_active:
120            all_datasets = [d for d in all_datasets if d.active and d.get_sequencing_groups()]
121        return all_datasets

Gets list of all datasets. Include only "active" datasets (unless only_active is False)

def get_sequencing_groups( self, only_active: bool = True) -> list[cpg_flow.targets.sequencing_group.SequencingGroup]:
123    def get_sequencing_groups(
124        self,
125        only_active: bool = True,
126    ) -> list['SequencingGroup']:
127        """
128        Gets a flat list of all sequencing groups from all datasets.
129        uses a dictionary to avoid duplicates (we could have the same sequencing group in multiple cohorts)
130        Include only "active" sequencing groups (unless only_active is False)
131        """
132        all_sequencing_groups: dict[str, SequencingGroup] = {}
133        for dataset in self.get_datasets(only_active):
134            for sg in dataset.get_sequencing_groups(only_active):
135                all_sequencing_groups[sg.id] = sg
136        return list(all_sequencing_groups.values())

Gets a flat list of all sequencing groups from all datasets. Uses a dictionary to avoid duplicates (we could have the same sequencing group in multiple cohorts). Includes only "active" sequencing groups (unless only_active is False).

def create_cohort(self, name: str):
138    def create_cohort(self, name: str):
139        """
140        Create a cohort and add it to the multi-cohort.
141        """
142        if name in self._cohorts_by_name:
143            LOGGER.debug(f'Cohort {name} already exists in the multi-cohort')
144            return self._cohorts_by_name[name]
145
146        c = Cohort(name=name)
147        self._cohorts_by_name[c.name] = c
148        return c

Create a cohort and add it to the multi-cohort.

def add_dataset( self, d: cpg_flow.targets.dataset.Dataset) -> cpg_flow.targets.dataset.Dataset:
150    def add_dataset(self, d: 'Dataset') -> 'Dataset':
151        """
152        Add a Dataset to the MultiCohort
153        Args:
154            d: Dataset object
155        """
156        if d.name in self._datasets_by_name:
157            LOGGER.debug(
158                f'Dataset {d.name} already exists in the MultiCohort {self.name}',
159            )
160        else:
161            # We need create a new dataset to avoid manipulating the cohort dataset at this point
162            self._datasets_by_name[d.name] = Dataset(d.name)
163        return self._datasets_by_name[d.name]

Add a Dataset to the MultiCohort. Args: d: Dataset object.

def get_dataset_by_name( self, name: str, only_active: bool = True) -> Optional[cpg_flow.targets.dataset.Dataset]:
165    def get_dataset_by_name(
166        self,
167        name: str,
168        only_active: bool = True,
169    ) -> Optional['Dataset']:
170        """
171        Get dataset by name.
172        Include only "active" datasets (unless only_active is False)
173        """
174        ds_by_name = {d.name: d for d in self.get_datasets(only_active)}
175        return ds_by_name.get(name)

Get dataset by name. Include only "active" datasets (unless only_active is False)

def get_job_attrs(self) -> dict:
177    def get_job_attrs(self) -> dict:
178        """
179        Attributes for Hail Batch job.
180        """
181        return {
182            # 'sequencing_groups': self.get_sequencing_group_ids(),
183            'datasets': [d.name for d in self.get_datasets()],
184            'cohorts': [c.name for c in self.get_cohorts()],
185        }

Attributes for Hail Batch job.

def write_ped_file( self, out_path: cloudpathlib.cloudpath.CloudPath | pathlib.Path | None = None, use_participant_id: bool = False) -> cloudpathlib.cloudpath.CloudPath | pathlib.Path:
187    def write_ped_file(
188        self,
189        out_path: Path | None = None,
190        use_participant_id: bool = False,
191    ) -> Path:
192        """
193        Create a PED file for all samples in the whole MultiCohort
194        Duplication of the Cohort method
195        PED is written with no header line to be strict specification compliant
196        """
197        datas = []
198        for sequencing_group in self.get_sequencing_groups():
199            datas.append(
200                sequencing_group.pedigree.get_ped_dict(
201                    use_participant_id=use_participant_id,
202                ),
203            )
204        if not datas:
205            raise ValueError(f'No pedigree data found for {self.name}')
206        df = pd.DataFrame(datas)
207
208        if out_path is None:
209            out_path = self.analysis_dataset.tmp_prefix() / 'ped' / f'{self.get_alignment_inputs_hash()}.ped'
210
211        if not get_config()['workflow'].get('dry_run', False):
212            with out_path.open('w') as fp:
213                df.to_csv(fp, sep='\t', index=False, header=False)
214        return out_path

Create a PED file for all samples in the whole MultiCohort Duplication of the Cohort method PED is written with no header line to be strict specification compliant