hma model gives one value in every column #2145
Labels
bug
Something isn't working
data:multi-table
Related to multi-table, relational datasets
under discussion
Issue is currently being discussed
Environment details
If you are already running SDV, please indicate the following details about the environment in
which you are running it:
Problem description
<Replace this with a description of the problem that you are trying to solve using SDV. If
possible, describe the data that you are using, or consider attaching some example data
that others can use to propose a working solution for your problem.>
What I already tried
<Replace with a description of what you already tried and what is the behavior that you observe.
If possible, also add below the exact code that you are running.>
Hey.
I use the HMA model, and in the parent table I get high scores for every column except ENTRY_DATE. Comparing the real and synthetic tables, I found that the synthetic data contains a single value for the entire column, whereas the real table has many different dates.
When I try to plot a comparison of this column with get_column_plot, I get the following error:
LinAlgError Traceback (most recent call last)
File ~\anaconda3\lib\site-packages\scipy\stats\_kde.py:226, in gaussian_kde.__init__(self, dataset, bw_method, weights)
225 try:
--> 226 self.set_bandwidth(bw_method=bw_method)
227 except linalg.LinAlgError as e:
File ~\anaconda3\lib\site-packages\scipy\stats\_kde.py:574, in gaussian_kde.set_bandwidth(self, bw_method)
572 raise ValueError(msg)
--> 574 self._compute_covariance()
File ~\anaconda3\lib\site-packages\scipy\stats\_kde.py:586, in gaussian_kde._compute_covariance(self)
583 self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
584 bias=False,
585 aweights=self.weights))
--> 586 self._data_cho_cov = linalg.cholesky(self._data_covariance,
587 lower=True)
589 self.covariance = self._data_covariance * self.factor**2
File ~\anaconda3\lib\site-packages\scipy\linalg\_decomp_cholesky.py:88, in cholesky(a, lower, overwrite_a, check_finite)
45 """
46 Compute the Cholesky decomposition of a matrix.
47
(...)
86
87 """
---> 88 c, lower = _cholesky(a, lower=lower, overwrite_a=overwrite_a, clean=True,
89 check_finite=check_finite)
90 return c
File ~\anaconda3\lib\site-packages\scipy\linalg\_decomp_cholesky.py:36, in _cholesky(a, lower, overwrite_a, clean, check_finite)
35 if info > 0:
---> 36 raise LinAlgError("%d-th leading minor of the array is not positive "
37 "definite" % info)
38 if info < 0:
LinAlgError: 1-th leading minor of the array is not positive definite
The above exception was the direct cause of the following exception:
LinAlgError Traceback (most recent call last)
Cell In[2002], line 3
1 from sdv.evaluation.multi_table import get_column_plot
----> 3 fig = get_column_plot(
4 real_data=cleaned_data,
5 synthetic_data=synthetic_data,
6 column_name='ENTRY_DATE',
7 table_name='visits_data',
8 metadata=metadata
9 )
11 fig.show()
File ~\anaconda3\lib\site-packages\sdv\evaluation\multi_table.py:81, in get_column_plot(real_data, synthetic_data, metadata, table_name, column_name, plot_type)
79 real_data = real_data[table_name]
80 synthetic_data = synthetic_data[table_name]
---> 81 return single_table_visualization.get_column_plot(
82 real_data,
83 synthetic_data,
84 metadata,
85 column_name,
86 plot_type,
87 )
File ~\anaconda3\lib\site-packages\sdv\evaluation\single_table.py:102, in get_column_plot(real_data, synthetic_data, metadata, column_name, plot_type)
95 real_data = pd.DataFrame({
96 column_name: pd.to_datetime(real_data[column_name], format=datetime_format)
97 })
98 synthetic_data = pd.DataFrame({
99 column_name: pd.to_datetime(synthetic_data[column_name], format=datetime_format)
100 })
--> 102 return visualization.get_column_plot(
103 real_data,
104 synthetic_data,
105 column_name,
106 plot_type=plot_type
107 )
File ~\anaconda3\lib\site-packages\sdmetrics\visualization.py:39, in set_plotly_config.<locals>.wrapper(*args, **kwargs)
36 except Exception:
37 pass
---> 39 return function(*args, **kwargs)
File ~\anaconda3\lib\site-packages\sdmetrics\visualization.py:440, in get_column_plot(real_data, synthetic_data, column_name, plot_type)
437 real_column = real_data[column_name]
438 synthetic_column = synthetic_data[column_name]
--> 440 fig = _generate_column_plot(real_column, synthetic_column, plot_type)
442 return fig
File ~\anaconda3\lib\site-packages\sdmetrics\visualization.py:260, in _generate_column_plot(real_column, synthetic_column, plot_type, plot_kwargs, plot_title, x_label)
258 elif plot_type == 'distplot':
259 x_label = x_label or 'Value'
--> 260 fig = _generate_column_distplot(real_data, synthetic_data, plot_kwargs)
261 trace_args = {'fill': 'tozeroy'}
263 for i, name in enumerate(['Real', 'Synthetic']):
File ~\anaconda3\lib\site-packages\sdmetrics\visualization.py:198, in _generate_column_distplot(real_data, synthetic_data, plot_kwargs)
178 """Plot the real and synthetic data as a distplot.
179
180 Args:
(...)
190 plotly.graph_objects._figure.Figure
191 """
192 default_distplot_kwargs = {
193 'show_hist': False,
194 'show_rug': False,
195 'colors': [PlotConfig.DATACEBO_DARK, PlotConfig.DATACEBO_GREEN]
196 }
--> 198 fig = ff.create_distplot(
199 [real_data['values'], synthetic_data['values']],
200 ['Real', 'Synthetic'],
201 **{**default_distplot_kwargs, **plot_kwargs}
202 )
204 return fig
File ~\anaconda3\lib\site-packages\plotly\figure_factory\_distplot.py:216, in create_distplot(hist_data, group_labels, bin_size, curve_type, colors, rug_text, histnorm, show_hist, show_curve, show_rug)
204 curve = _Distplot(
205 hist_data,
206 histnorm,
(...)
213 show_curve,
214 ).make_normal()
215 else:
--> 216 curve = _Distplot(
217 hist_data,
218 histnorm,
219 group_labels,
220 bin_size,
221 curve_type,
222 colors,
223 rug_text,
224 show_hist,
225 show_curve,
226 ).make_kde()
228 data.append(curve)
230 if show_rug:
File ~\anaconda3\lib\site-packages\plotly\figure_factory\_distplot.py:361, in _Distplot.make_kde(self)
356 for index in range(self.trace_number):
357 self.curve_x[index] = [
358 self.start[index] + x * (self.end[index] - self.start[index]) / 500
359 for x in range(500)
360 ]
--> 361 self.curve_y[index] = scipy_stats.gaussian_kde(self.hist_data[index])(
362 self.curve_x[index]
363 )
365 if self.histnorm == ALTERNATIVE_HISTNORM:
366 self.curve_y[index] *= self.bin_size[index]
File ~\anaconda3\lib\site-packages\scipy\stats\_kde.py:235, in gaussian_kde.__init__(self, dataset, bw_method, weights)
227 except linalg.LinAlgError as e:
228 msg = ("The data appears to lie in a lower-dimensional subspace "
229 "of the space in which it is expressed. This has resulted "
230 "in a singular data covariance matrix, which cannot be "
(...)
233 "analysis / dimensionality reduction and using "
234 "`gaussian_kde` with the transformed data.")
--> 235 raise linalg.LinAlgError(msg) from e
LinAlgError: The data appears to lie in a lower-dimensional subspace of the space in which it is expressed. This has resulted in a singular data covariance matrix, which cannot be treated using the algorithms implemented in `gaussian_kde`. Consider performing principle component analysis / dimensionality reduction and using `gaussian_kde` with the transformed data.
I would love to understand why the model generates the same value for every row in this column, and also how I can resolve this error.
I am also attaching the metadata file
metadata = {
"tables": {
"visits_data": {
"primary_key": "PATIENTID_VISITDATE",
"columns": {
"PATIENTID": { "sdtype": "id", "regex_format": "\d{2}-\d{2}-\d{1}[A-Z]-\d{4}-[A-Z]-\d{5}"},
"PATIENTID_VISITDATE": { "sdtype": "id", "regex_format": "\d{2}-\d{2}-\d{1}[A-Z]-\d{4}-[A-Z]-\d{5}\d{2}-\d{2}-\d{4}"},
"VISITDATE": { "sdtype": "datetime", "datetime_format": "%Y-%m-%d" },
"VISITTYPECODE": { "sdtype": "categorical" },
"NOWPREGNANT": { "sdtype": "categorical" },
"FAMILYPLANNINGID": { "sdtype": "categorical" },
"FUNCTIONALSTATUSCODE": { "sdtype": "categorical" },
"WHOSTAGE": { "sdtype": "categorical" },
"TBSTATUSCODE": { "sdtype": "categorical" },
"IPTREASONCODE": { "sdtype": "categorical" },
"ARVSTATUSCODE": { "sdtype": "categorical" },
"ARVREASONCODE": { "sdtype": "categorical" },
"ARVCODE": { "sdtype": "categorical" },
"NOTES": { "sdtype": "text" },
"STAFFID": { "sdtype": "categorical" },
"USERNUMBER": { "sdtype": "categorical" },
"THETIMESTAMP": { "sdtype": "datetime", "datetime_format": "%Y-%m-%d" },
"IMPORTFILE": { "sdtype": "categorical" },
"FACILITY": { "sdtype": "categorical" },
"ENTRY_DATE": { "sdtype": "datetime", "datetime_format": "%Y-%m-%d" }
}
},
"test_data": {
"primary_key": "ID",
"columns": {
"ID": { "sdtype": "id" },
"PATIENTID_VISITDATE": { "sdtype": "id", "regex_format": "\d{2}-\d{2}-\d{1}[A-Z]-\d{4}-[A-Z]-\d{5}\d{2}-\d{2}-\d{4}" },
"PATIENTID_VISITDATE_ID": { "sdtype": "id", "regex_format": "\d{2}-\d{2}-\d{1}[A-Z]-\d{4}-[A-Z]-\d{5}\d{2}-\d{2}-\d{4}\d+" },
"PATIENTID": { "sdtype": "id", "regex_format": "\d{2}-\d{2}-\d{1}[A-Z]-\d{4}-[A-Z]-\d{5}" },
"TESTTYPEID": { "sdtype": "categorical" },
"TESTDATE": { "sdtype": "datetime", "datetime_format": "%Y-%m-%d" },
"RESULTDATE": { "sdtype": "datetime", "datetime_format": "%Y-%m-%d" },
"RESULTNUMERIC": { "sdtype": "numerical" },
"RESULTNOTES": { "sdtype": "text" },
"IMPORTFILE": { "sdtype": "categorical" },
"FACILITY": { "sdtype": "categorical" },
"ENTRY_DATE": { "sdtype": "datetime", "datetime_format": "%Y-%m-%d" }
}
},
"medication_data": {
"primary_key": "PATIENTID_VISITDATE_DRUGTYPEID",
"columns": {
"PATIENTID_VISITDATE": { "sdtype": "id", "regex_format": "\d{2}-\d{2}-\d{1}[A-Z]-\d{4}-[A-Z]-\d{5}\d{2}-\d{2}-\d{4}" },
"PATIENTID_VISITDATE_DRUGTYPEID": { "sdtype": "id", "regex_format": "\d{2}-\d{2}-\d{1}[A-Z]-\d{4}-[A-Z]-\d{5}\d{2}-\d{2}-\d{4}_[A-Z]+" },
"PATIENTID": { "sdtype": "id", "regex_format": "\d{2}-\d{2}-\d{1}[A-Z]-\d{4}-[A-Z]-\d{5}" },
"VISITDATE": { "sdtype": "datetime", "datetime_format": "%Y-%m-%d" },
"DRUGTYPEID": { "sdtype": "categorical" },
"QUANTITYDISPENSED": { "sdtype": "numerical" },
"PERCENTADHERENCE": { "sdtype": "numerical" },
"NOTES": { "sdtype": "text" },
"IMPORTFILE": { "sdtype": "categorical" },
"FACILITY": { "sdtype": "categorical" },
"ENTRY_DATE": { "sdtype": "datetime", "datetime_format": "%Y-%m-%d" }
}
}
},
"relationships": [
{
"parent_table_name": "visits_data",
"child_table_name": "test_data",
"parent_primary_key": "PATIENTID_VISITDATE",
"child_foreign_key": "PATIENTID_VISITDATE"
},
{
"parent_table_name": "visits_data",
"child_table_name": "medication_data",
"parent_primary_key": "PATIENTID_VISITDATE",
"child_foreign_key": "PATIENTID_VISITDATE"
}
]
}
The text was updated successfully, but these errors were encountered: