cpg_flow.targets.dataset

This module defines the Dataset class, which is part of the cpg-flow system for managing genomic datasets.

The Dataset class allows for:

  • Managing sequencing groups: Creation, addition, and retrieval of sequencing groups that are part of the dataset.
  • Storage path management: Provides methods to access primary, temporary, analysis, and web storage paths.
  • Integration with configurations: Uses configuration settings for workflow management and path handling.
  • Pedigree file generation: Capable of generating PED files based on sequencing group data for genetic analysis.
  • Logging: Utilizes LOGGER to track and debug sequencing group additions.

Key Components:

  • Dataset: Main class that represents a genomic dataset and extends from the Target class.
  • SequencingGroup Management: Methods to handle sequencing groups related to the dataset.
  • Path Handling: Methods to derive and handle various storage paths.
  • Configurations: Integration with external configuration settings for flexible dataset handling.

This module is essential for organizing and managing data resources in CPG-related projects.

  1"""
  2This module defines the `Dataset` class, which is part of the cpg-flow system for managing genomic datasets.
  3
  4The `Dataset` class allows for:
  5- Managing sequencing groups: Creation, addition, and retrieval of sequencing groups that are part of the dataset.
  6- Storage path management: Provides methods to access primary, temporary, analysis, and web storage paths.
  7- Integration with configurations: Uses configuration settings for workflow management and path handling.
  8- Pedigree file generation: Capable of generating PED files based on sequencing group data for genetic analysis.
  9- Logging: Utilizes LOGGER to track and debug sequencing group additions.
 10
 11Key Components:
 12- Dataset: Main class that represents a genomic dataset and extends from the `Target` class.
 13- SequencingGroup Management: Methods to handle sequencing groups related to the dataset.
 14- Path Handling: Methods to derive and handle various storage paths.
 15- Configurations: Integration with external configuration settings for flexible dataset handling.
 16
 17This module is essential for organizing and managing data resources in CPG-related projects.
 18
 19"""
 20
 21from typing import TYPE_CHECKING, Optional
 22
 23import pandas as pd
 24
 25from cpg_flow.filetypes import AlignmentInput
 26from cpg_flow.targets import SequencingGroup, Target, seq_type_subdir
 27from cpg_flow.utils import get_logger
 28from cpg_utils import Path, to_path
 29from cpg_utils.config import dataset_path, get_config, web_url
 30
 31LOGGER = get_logger(__name__)
 32
 33if TYPE_CHECKING:
 34    from cpg_flow.targets import PedigreeInfo, Sex
 35
 36
 37class Dataset(Target):
 38    """
 39    Represents a CPG dataset.
 40
 41    Each `dataset` at the CPG corresponds to
 42    * a GCP project: https://github.com/populationgenomics/team-docs/tree/main/storage_policies
 43    * a Pulumi stack: https://github.com/populationgenomics/analysis-runner/tree/main/stack
 44    * a metamist project
 45    """
 46
 47    def __init__(
 48        self,
 49        name: str,
 50    ):
 51        super().__init__()
 52        self._sequencing_group_by_id: dict[str, SequencingGroup] = {}
 53        self.name = name
 54        self.active = True
 55
 56    @staticmethod
 57    def create(name: str) -> 'Dataset':
 58        """
 59        Create a dataset.
 60        """
 61        return Dataset(name=name)
 62
 63    @property
 64    def target_id(self) -> str:
 65        """Unique target ID"""
 66        return self.name
 67
 68    def __repr__(self):
 69        return f'Dataset("{self.name}", {len(self.get_sequencing_groups())} sequencing groups)'
 70
 71    def __str__(self):
 72        return f'{self.name} ({len(self.get_sequencing_groups())} sequencing groups)'
 73
 74    def prefix(self, **kwargs) -> Path:
 75        """
 76        The primary storage path.
 77        """
 78        return to_path(
 79            dataset_path(
 80                seq_type_subdir(),
 81                dataset=self.name,
 82                **kwargs,
 83            ),
 84        )
 85
 86    def tmp_prefix(self, **kwargs) -> Path:
 87        """
 88        Storage path for temporary files.
 89        """
 90        return to_path(
 91            dataset_path(
 92                seq_type_subdir(),
 93                dataset=self.name,
 94                category='tmp',
 95                **kwargs,
 96            ),
 97        )
 98
 99    def analysis_prefix(self, **kwargs) -> Path:
100        """
101        Storage path for analysis files.
102        """
103        return to_path(
104            dataset_path(
105                seq_type_subdir(),
106                dataset=self.name,
107                category='analysis',
108                **kwargs,
109            ),
110        )
111
112    def web_prefix(self, **kwargs) -> Path:
113        """
114        Path for files served by an HTTP server Matches corresponding URLs returns by
115        self.web_url() URLs.
116        """
117        return to_path(
118            dataset_path(
119                seq_type_subdir(),
120                dataset=self.name,
121                category='web',
122                **kwargs,
123            ),
124        )
125
126    def web_url(self) -> str | None:
127        """
128        URLs matching self.storage_web_path() files serverd by an HTTP server.
129        """
130        return web_url(
131            seq_type_subdir(),
132            dataset=self.name,
133        )
134
135    def add_sequencing_group(
136        self,
137        id: str,  # pylint: disable=redefined-builtin
138        *,
139        sequencing_type: str,
140        sequencing_technology: str,
141        sequencing_platform: str,
142        external_id: str | None = None,
143        participant_id: str | None = None,
144        meta: dict | None = None,
145        sex: Optional['Sex'] = None,
146        pedigree: Optional['PedigreeInfo'] = None,
147        alignment_input: AlignmentInput | None = None,
148    ) -> 'SequencingGroup':
149        """
150        Create a new sequencing group and add it to the dataset.
151        """
152        if id in self._sequencing_group_by_id:
153            LOGGER.debug(
154                f'SequencingGroup {id} already exists in the dataset {self.name}',
155            )
156            return self._sequencing_group_by_id[id]
157
158        force_sgs = get_config()['workflow'].get('force_sgs', set())
159        forced = id in force_sgs or external_id in force_sgs or participant_id in force_sgs
160
161        s = SequencingGroup(
162            id=id,
163            dataset=self,
164            external_id=external_id,
165            sequencing_type=sequencing_type,
166            sequencing_technology=sequencing_technology,
167            sequencing_platform=sequencing_platform,
168            participant_id=participant_id,
169            meta=meta,
170            sex=sex,
171            pedigree=pedigree,
172            alignment_input=alignment_input,
173            forced=forced,
174        )
175        self._sequencing_group_by_id[id] = s
176        return s
177
178    def add_sequencing_group_object(self, s: 'SequencingGroup'):
179        """
180        Add a sequencing group object to the dataset.
181        Args:
182            s: SequencingGroup object
183        """
184        if s.id in self._sequencing_group_by_id:
185            LOGGER.debug(
186                f'SequencingGroup {s.id} already exists in the dataset {self.name}',
187            )
188        else:
189            self._sequencing_group_by_id[s.id] = s
190
191    def get_sequencing_group_by_id(self, id: str) -> Optional['SequencingGroup']:
192        """
193        Get sequencing group by ID
194        """
195        return self._sequencing_group_by_id.get(id)
196
197    def get_sequencing_groups(
198        self,
199        only_active: bool = True,
200    ) -> list['SequencingGroup']:
201        """
202        Get dataset's sequencing groups. Include only "active" sequencing groups, unless only_active=False
203        """
204        return [s for sid, s in self._sequencing_group_by_id.items() if (s.active or not only_active)]
205
206    def get_job_attrs(self) -> dict:
207        """
208        Attributes for Hail Batch job.
209        """
210        return {
211            'dataset': self.name,
212            # 'sequencing_groups': self.get_sequencing_group_ids(),
213        }
214
215    def get_job_prefix(self) -> str:
216        """
217        Prefix job names.
218        """
219        return f'{self.name}: '
220
221    def write_ped_file(
222        self,
223        out_path: Path | None = None,
224        use_participant_id: bool = False,
225    ) -> Path:
226        """
227        Create a PED file for all sequencing groups
228        PED is written with no header line to be strict specification compliant
229        """
230        datas = []
231        for sequencing_group in self.get_sequencing_groups():
232            datas.append(
233                sequencing_group.pedigree.get_ped_dict(
234                    use_participant_id=use_participant_id,
235                ),
236            )
237        if not datas:
238            raise ValueError(f'No pedigree data found for {self.name}')
239        df = pd.DataFrame(datas)
240
241        if out_path is None:
242            out_path = self.tmp_prefix() / 'ped' / f'{self.get_alignment_inputs_hash()}.ped'
243
244        if not get_config()['workflow'].get('dry_run', False):
245            with out_path.open('w') as fp:
246                df.to_csv(fp, sep='\t', index=False, header=False)
247        return out_path
LOGGER = <Logger cpg_flow.targets.dataset (INFO)>
class Dataset(cpg_flow.targets.target.Target):
 38class Dataset(Target):
 39    """
 40    Represents a CPG dataset.
 41
 42    Each `dataset` at the CPG corresponds to
 43    * a GCP project: https://github.com/populationgenomics/team-docs/tree/main/storage_policies
 44    * a Pulumi stack: https://github.com/populationgenomics/analysis-runner/tree/main/stack
 45    * a metamist project
 46    """
 47
 48    def __init__(
 49        self,
 50        name: str,
 51    ):
 52        super().__init__()
 53        self._sequencing_group_by_id: dict[str, SequencingGroup] = {}
 54        self.name = name
 55        self.active = True
 56
 57    @staticmethod
 58    def create(name: str) -> 'Dataset':
 59        """
 60        Create a dataset.
 61        """
 62        return Dataset(name=name)
 63
 64    @property
 65    def target_id(self) -> str:
 66        """Unique target ID"""
 67        return self.name
 68
 69    def __repr__(self):
 70        return f'Dataset("{self.name}", {len(self.get_sequencing_groups())} sequencing groups)'
 71
 72    def __str__(self):
 73        return f'{self.name} ({len(self.get_sequencing_groups())} sequencing groups)'
 74
 75    def prefix(self, **kwargs) -> Path:
 76        """
 77        The primary storage path.
 78        """
 79        return to_path(
 80            dataset_path(
 81                seq_type_subdir(),
 82                dataset=self.name,
 83                **kwargs,
 84            ),
 85        )
 86
 87    def tmp_prefix(self, **kwargs) -> Path:
 88        """
 89        Storage path for temporary files.
 90        """
 91        return to_path(
 92            dataset_path(
 93                seq_type_subdir(),
 94                dataset=self.name,
 95                category='tmp',
 96                **kwargs,
 97            ),
 98        )
 99
100    def analysis_prefix(self, **kwargs) -> Path:
101        """
102        Storage path for analysis files.
103        """
104        return to_path(
105            dataset_path(
106                seq_type_subdir(),
107                dataset=self.name,
108                category='analysis',
109                **kwargs,
110            ),
111        )
112
113    def web_prefix(self, **kwargs) -> Path:
114        """
 115        Path for files served by an HTTP server. Matches the corresponding URLs
 116        returned by self.web_url().
117        """
118        return to_path(
119            dataset_path(
120                seq_type_subdir(),
121                dataset=self.name,
122                category='web',
123                **kwargs,
124            ),
125        )
126
127    def web_url(self) -> str | None:
128        """
 129        URLs matching self.storage_web_path() files served by an HTTP server.
130        """
131        return web_url(
132            seq_type_subdir(),
133            dataset=self.name,
134        )
135
136    def add_sequencing_group(
137        self,
138        id: str,  # pylint: disable=redefined-builtin
139        *,
140        sequencing_type: str,
141        sequencing_technology: str,
142        sequencing_platform: str,
143        external_id: str | None = None,
144        participant_id: str | None = None,
145        meta: dict | None = None,
146        sex: Optional['Sex'] = None,
147        pedigree: Optional['PedigreeInfo'] = None,
148        alignment_input: AlignmentInput | None = None,
149    ) -> 'SequencingGroup':
150        """
151        Create a new sequencing group and add it to the dataset.
152        """
153        if id in self._sequencing_group_by_id:
154            LOGGER.debug(
155                f'SequencingGroup {id} already exists in the dataset {self.name}',
156            )
157            return self._sequencing_group_by_id[id]
158
159        force_sgs = get_config()['workflow'].get('force_sgs', set())
160        forced = id in force_sgs or external_id in force_sgs or participant_id in force_sgs
161
162        s = SequencingGroup(
163            id=id,
164            dataset=self,
165            external_id=external_id,
166            sequencing_type=sequencing_type,
167            sequencing_technology=sequencing_technology,
168            sequencing_platform=sequencing_platform,
169            participant_id=participant_id,
170            meta=meta,
171            sex=sex,
172            pedigree=pedigree,
173            alignment_input=alignment_input,
174            forced=forced,
175        )
176        self._sequencing_group_by_id[id] = s
177        return s
178
179    def add_sequencing_group_object(self, s: 'SequencingGroup'):
180        """
181        Add a sequencing group object to the dataset.
182        Args:
183            s: SequencingGroup object
184        """
185        if s.id in self._sequencing_group_by_id:
186            LOGGER.debug(
187                f'SequencingGroup {s.id} already exists in the dataset {self.name}',
188            )
189        else:
190            self._sequencing_group_by_id[s.id] = s
191
192    def get_sequencing_group_by_id(self, id: str) -> Optional['SequencingGroup']:
193        """
194        Get sequencing group by ID
195        """
196        return self._sequencing_group_by_id.get(id)
197
198    def get_sequencing_groups(
199        self,
200        only_active: bool = True,
201    ) -> list['SequencingGroup']:
202        """
203        Get dataset's sequencing groups. Include only "active" sequencing groups, unless only_active=False
204        """
205        return [s for sid, s in self._sequencing_group_by_id.items() if (s.active or not only_active)]
206
207    def get_job_attrs(self) -> dict:
208        """
209        Attributes for Hail Batch job.
210        """
211        return {
212            'dataset': self.name,
213            # 'sequencing_groups': self.get_sequencing_group_ids(),
214        }
215
216    def get_job_prefix(self) -> str:
217        """
218        Prefix job names.
219        """
220        return f'{self.name}: '
221
222    def write_ped_file(
223        self,
224        out_path: Path | None = None,
225        use_participant_id: bool = False,
226    ) -> Path:
227        """
228        Create a PED file for all sequencing groups
229        PED is written with no header line to be strict specification compliant
230        """
231        datas = []
232        for sequencing_group in self.get_sequencing_groups():
233            datas.append(
234                sequencing_group.pedigree.get_ped_dict(
235                    use_participant_id=use_participant_id,
236                ),
237            )
238        if not datas:
239            raise ValueError(f'No pedigree data found for {self.name}')
240        df = pd.DataFrame(datas)
241
242        if out_path is None:
243            out_path = self.tmp_prefix() / 'ped' / f'{self.get_alignment_inputs_hash()}.ped'
244
245        if not get_config()['workflow'].get('dry_run', False):
246            with out_path.open('w') as fp:
247                df.to_csv(fp, sep='\t', index=False, header=False)
248        return out_path

Represents a CPG dataset.

Each dataset at the CPG corresponds to

Dataset(name: str)
48    def __init__(
49        self,
50        name: str,
51    ):
52        super().__init__()
53        self._sequencing_group_by_id: dict[str, SequencingGroup] = {}
54        self.name = name
55        self.active = True
name
active
@staticmethod
def create(name: str) -> Dataset:
57    @staticmethod
58    def create(name: str) -> 'Dataset':
59        """
60        Create a dataset.
61        """
62        return Dataset(name=name)

Create a dataset.

target_id: str
64    @property
65    def target_id(self) -> str:
66        """Unique target ID"""
67        return self.name

Unique target ID

def prefix(self, **kwargs) -> cloudpathlib.cloudpath.CloudPath | pathlib.Path:
75    def prefix(self, **kwargs) -> Path:
76        """
77        The primary storage path.
78        """
79        return to_path(
80            dataset_path(
81                seq_type_subdir(),
82                dataset=self.name,
83                **kwargs,
84            ),
85        )

The primary storage path.

def tmp_prefix(self, **kwargs) -> cloudpathlib.cloudpath.CloudPath | pathlib.Path:
87    def tmp_prefix(self, **kwargs) -> Path:
88        """
89        Storage path for temporary files.
90        """
91        return to_path(
92            dataset_path(
93                seq_type_subdir(),
94                dataset=self.name,
95                category='tmp',
96                **kwargs,
97            ),
98        )

Storage path for temporary files.

def analysis_prefix(self, **kwargs) -> cloudpathlib.cloudpath.CloudPath | pathlib.Path:
100    def analysis_prefix(self, **kwargs) -> Path:
101        """
102        Storage path for analysis files.
103        """
104        return to_path(
105            dataset_path(
106                seq_type_subdir(),
107                dataset=self.name,
108                category='analysis',
109                **kwargs,
110            ),
111        )

Storage path for analysis files.

def web_prefix(self, **kwargs) -> cloudpathlib.cloudpath.CloudPath | pathlib.Path:
113    def web_prefix(self, **kwargs) -> Path:
114        """
 115        Path for files served by an HTTP server. Matches the corresponding URLs
 116        returned by self.web_url().
117        """
118        return to_path(
119            dataset_path(
120                seq_type_subdir(),
121                dataset=self.name,
122                category='web',
123                **kwargs,
124            ),
125        )

Path for files served by an HTTP server. Matches the corresponding URLs returned by self.web_url().

def web_url(self) -> str | None:
127    def web_url(self) -> str | None:
128        """
 129        URLs matching self.storage_web_path() files served by an HTTP server.
130        """
131        return web_url(
132            seq_type_subdir(),
133            dataset=self.name,
134        )

URLs matching self.storage_web_path() files served by an HTTP server.

def add_sequencing_group( self, id: str, *, sequencing_type: str, sequencing_technology: str, sequencing_platform: str, external_id: str | None = None, participant_id: str | None = None, meta: dict | None = None, sex: Optional[cpg_flow.targets.types.Sex] = None, pedigree: Optional[cpg_flow.targets.pedigree_info.PedigreeInfo] = None, alignment_input: cpg_flow.filetypes.AlignmentInput | None = None) -> cpg_flow.targets.sequencing_group.SequencingGroup:
136    def add_sequencing_group(
137        self,
138        id: str,  # pylint: disable=redefined-builtin
139        *,
140        sequencing_type: str,
141        sequencing_technology: str,
142        sequencing_platform: str,
143        external_id: str | None = None,
144        participant_id: str | None = None,
145        meta: dict | None = None,
146        sex: Optional['Sex'] = None,
147        pedigree: Optional['PedigreeInfo'] = None,
148        alignment_input: AlignmentInput | None = None,
149    ) -> 'SequencingGroup':
150        """
151        Create a new sequencing group and add it to the dataset.
152        """
153        if id in self._sequencing_group_by_id:
154            LOGGER.debug(
155                f'SequencingGroup {id} already exists in the dataset {self.name}',
156            )
157            return self._sequencing_group_by_id[id]
158
159        force_sgs = get_config()['workflow'].get('force_sgs', set())
160        forced = id in force_sgs or external_id in force_sgs or participant_id in force_sgs
161
162        s = SequencingGroup(
163            id=id,
164            dataset=self,
165            external_id=external_id,
166            sequencing_type=sequencing_type,
167            sequencing_technology=sequencing_technology,
168            sequencing_platform=sequencing_platform,
169            participant_id=participant_id,
170            meta=meta,
171            sex=sex,
172            pedigree=pedigree,
173            alignment_input=alignment_input,
174            forced=forced,
175        )
176        self._sequencing_group_by_id[id] = s
177        return s

Create a new sequencing group and add it to the dataset.

def add_sequencing_group_object(self, s: cpg_flow.targets.sequencing_group.SequencingGroup):
179    def add_sequencing_group_object(self, s: 'SequencingGroup'):
180        """
181        Add a sequencing group object to the dataset.
182        Args:
183            s: SequencingGroup object
184        """
185        if s.id in self._sequencing_group_by_id:
186            LOGGER.debug(
187                f'SequencingGroup {s.id} already exists in the dataset {self.name}',
188            )
189        else:
190            self._sequencing_group_by_id[s.id] = s

Add a sequencing group object to the dataset. Args: s: SequencingGroup object

def get_sequencing_group_by_id( self, id: str) -> Optional[cpg_flow.targets.sequencing_group.SequencingGroup]:
192    def get_sequencing_group_by_id(self, id: str) -> Optional['SequencingGroup']:
193        """
194        Get sequencing group by ID
195        """
196        return self._sequencing_group_by_id.get(id)

Get sequencing group by ID

def get_sequencing_groups( self, only_active: bool = True) -> list[cpg_flow.targets.sequencing_group.SequencingGroup]:
198    def get_sequencing_groups(
199        self,
200        only_active: bool = True,
201    ) -> list['SequencingGroup']:
202        """
203        Get dataset's sequencing groups. Include only "active" sequencing groups, unless only_active=False
204        """
205        return [s for sid, s in self._sequencing_group_by_id.items() if (s.active or not only_active)]

Get dataset's sequencing groups. Include only "active" sequencing groups, unless only_active=False

def get_job_attrs(self) -> dict:
207    def get_job_attrs(self) -> dict:
208        """
209        Attributes for Hail Batch job.
210        """
211        return {
212            'dataset': self.name,
213            # 'sequencing_groups': self.get_sequencing_group_ids(),
214        }

Attributes for Hail Batch job.

def get_job_prefix(self) -> str:
216    def get_job_prefix(self) -> str:
217        """
218        Prefix job names.
219        """
220        return f'{self.name}: '

Prefix job names.

def write_ped_file( self, out_path: cloudpathlib.cloudpath.CloudPath | pathlib.Path | None = None, use_participant_id: bool = False) -> cloudpathlib.cloudpath.CloudPath | pathlib.Path:
222    def write_ped_file(
223        self,
224        out_path: Path | None = None,
225        use_participant_id: bool = False,
226    ) -> Path:
227        """
228        Create a PED file for all sequencing groups
229        PED is written with no header line to be strict specification compliant
230        """
231        datas = []
232        for sequencing_group in self.get_sequencing_groups():
233            datas.append(
234                sequencing_group.pedigree.get_ped_dict(
235                    use_participant_id=use_participant_id,
236                ),
237            )
238        if not datas:
239            raise ValueError(f'No pedigree data found for {self.name}')
240        df = pd.DataFrame(datas)
241
242        if out_path is None:
243            out_path = self.tmp_prefix() / 'ped' / f'{self.get_alignment_inputs_hash()}.ped'
244
245        if not get_config()['workflow'].get('dry_run', False):
246            with out_path.open('w') as fp:
247                df.to_csv(fp, sep='\t', index=False, header=False)
248        return out_path

Create a PED file for all sequencing groups PED is written with no header line to be strict specification compliant