diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..cbb9010
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+*.aux
+*.log
+*.out
+*.synctex.gz
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
index c5fffc0..5a77245 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2025 admin
+Copyright (c) 2025 Ziao Wang

 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
diff --git a/sample.py b/sample.py
new file mode 100644
index 0000000..8e10891
--- /dev/null
+++ b/sample.py
@@ -0,0 +1,27 @@
+import numpy as np
+
+N = 100000
+d = 1000
+Xs = np.random.normal(size=(N,d))
+
+# We demonstrate that X0 is optimal. Find vector k.
+X0 = Xs[0]
+X0p = X0.clip(min=0)
+L2_X0p = np.linalg.norm(X0p)
+X0p_hat = X0p / L2_X0p
+X0_score = np.dot(X0p_hat, X0)  # is also L2_X0p
+print("X0_score: %.4f" % X0_score)
+
+# Find scores; after partition, scores[-1] is the largest and scores[-2] the runner-up
+scores = Xs @ X0p_hat
+scores = np.partition(scores, -2)
+print("1st score: %.4f, squared = %.4f ~ d/2 = %.2f" % (scores[-1], scores[-1]**2, d/2))
+print("2nd score: %.4f, squared = %.4f < bd, bd ~ 2log(n) = %.4f" % (scores[-2], scores[-2]**2, 2*np.log(N)))
+
+print("If we allow other candidates to remove their negative scores")
+Xs = Xs.clip(min=0)
+scores = Xs @ X0p_hat
+scores = np.partition(scores, -2)
+dpart = np.sqrt(d/2)/np.pi
+npart = np.sqrt((1-1/np.pi)*np.log(N))
+print("2nd score: %.4f ~ sqrt{d/2pi^2} + sqrt{(1-1/pi)ln(n)} = %.4f" % (scores[-2], dpart+npart))
diff --git a/writeup/high-dim-optimal.pdf b/writeup/high-dim-optimal.pdf
new file mode 100644
index 0000000..495ea89
Binary files /dev/null and b/writeup/high-dim-optimal.pdf differ
diff --git a/writeup/high-dim-optimal.tex b/writeup/high-dim-optimal.tex
new file mode 100644
index 0000000..e0ceb58
--- /dev/null
+++ b/writeup/high-dim-optimal.tex
@@ -0,0 +1,156 @@
+\documentclass{article}
+\title{Probability of points being ``positive linear optimal'' in high dimensionality}
+\author{Ziao Wang}
+\date{}
+\usepackage{indentfirst}
+\usepackage{amsmath,amssymb}
+\usepackage[margin=1in]{geometry}
+\usepackage{hyperref}
+\usepackage{graphicx}
+\graphicspath{{./}}
+\usepackage{listings}
+% Optional: Global settings for Python listings
+\lstset{
+    language=Python,
+    basicstyle=\ttfamily\small,   % Monospaced font, small size
+    showstringspaces=false,       % Don't show spaces in strings
+    frame=single,                 % Add a frame around the listing
+    breaklines=true               % Allow line breaking
+}
+
+\begin{document}
+\maketitle
+
+\section*{1. Background}
+
+\section*{Lemma 1}
+If $X$ follows a $d$-dimensional standard Gaussian distribution and $v$ is a fixed unit vector, then $X \cdot v \sim N(0,1)$.
+
+Justification: the $d$-dimensional standard Gaussian distribution is rotationally symmetric, like a ball, so the direction of $v$ does not matter; alternatively, use the property that any linear combination of the components of a multivariate normal vector is itself a univariate normal random variable, and here the mean is 0 and the variance is $||v||^2 = 1$.
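+
+As a quick numerical illustration of Lemma 1 (a minimal sanity check added here; the sample count $m$ and the variable names are arbitrary), the following snippet projects standard Gaussian samples onto a fixed unit vector and checks that the projections have mean close to 0 and variance close to 1:
+
+\begin{lstlisting}
+import numpy as np
+
+# Sanity check for Lemma 1: X . v should look like N(0,1) for a fixed unit vector v.
+d = 1000
+m = 100000
+v = np.random.normal(size=d)
+v = v / np.linalg.norm(v)          # fixed unit vector
+X = np.random.normal(size=(m, d))  # m samples from a d-dimensional standard Gaussian
+proj = X @ v
+print("mean: %.4f (expect ~0), var: %.4f (expect ~1)" % (proj.mean(), proj.var()))
+\end{lstlisting}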
+
+\section*{Lemma 2}
+When $n \gg 1$, the maximum value among $n$ i.i.d. standard Gaussians is very unlikely to exceed $\sqrt{2\ln{n}}$ by much.
+
+We rely on Lemma 5.3 in \href{https://psdey.web.illinois.edu/math595fa19/lec05.pdf}{``Concentration for Maximum'', a University of Illinois lecture by Partha S. Dey (scribe: Hsin-Po Wang)} to support this claim. Copying Lemma 5.3 here:
+
+Suppose $M_n$ is the maximum of $n$ i.i.d. standard normals. Then
+
+$$ P\left[M_n \geq \sqrt{2\log{n}} + \frac{x}{\sqrt{2\log{n}}}\right] \leq e^{-x} $$
+
+\section*{Lemma 3}
+
+If $x$ is a standard Gaussian, its mean value given that $x>0$ is $\sqrt{\frac{2}{\pi}}$. That is, $x \sim N(0,1) \rightarrow E[x|x>0] = \sqrt{\frac{2}{\pi}}$.
+
+Derivation: conditioning on $x>0$ doubles the density on the positive half-line, so, substituting $u = x^2/2$,
+$$E[x|x>0] = 2\int_{0}^\infty x \frac{e^{-x^2/2}}{\sqrt{2\pi}}\,dx = \sqrt{\frac{2}{\pi}} \int_{0}^\infty xe^{-x^2/2}\,dx = \sqrt{\frac{2}{\pi}} \int_{0}^\infty e^{-u}\,du = \sqrt{\frac{2}{\pi}}$$
+
+\section*{2. Problem}
+
+Let $X_1,\ldots,X_n$ be i.i.d. samples from a $d$-dimensional standard Gaussian. We say sample $X_i$ is ``positive linear optimal'' (PLO) if there exists a $d$-dimensional vector $k$ with non-negative elements such that $X_i \cdot k > X_j \cdot k$ for all $j\neq i$. Suppose $n \gg d \gg \ln{n}$. As $n\rightarrow \infty$, is every point almost always, with some intermediate probability, or almost never positive linear optimal?
+
+\section*{2.1 Lower bound}
+If $X_i$ is the greatest in some dimension $r$, then $X_i$ is certainly PLO: just choose $k=e_r$. The probability of being the greatest in a given dimension is $\frac{1}{n}$, so the probability of being the greatest in at least one dimension is $1-(1-\frac{1}{n})^d \sim \frac{d}{n} \rightarrow 0$, which is too loose a lower bound to settle the question.
+
+\section*{2.2 Upper bound}
+If $X_i$ is PLO, then $X_i$ must be Pareto optimal: for every $j$, we must not have $X_j > X_i$ in every dimension (i.e. $X_i$ must not be dominated by $X_j$). The probability of being PLO is therefore upper bounded by the probability of being Pareto optimal. The latter can be estimated as follows: the probability of $X_i$ being dominated by a particular $X_j$ is the product over the $d$ dimensions, $\frac{1}{2^d}$. Assume that the event $X_j > X_i$ is independent of the event $X_m > X_i$. Then the probability of $X_i$ being Pareto optimal is about $1-\frac{n}{2^d} \rightarrow 1$, which is also too loose.
+
+
+\section*{3. Construction}
+
+Claim: Every sample is almost always PLO.
+
+Construction: For sample $X_i$, we choose $k$ to be $\frac{X_i^{+}}{||X_i^{+}||}$, where $X_i^{+}$ is the vector formed by keeping the components of $X_i$ that are positive and setting the non-positive components to 0. By construction, $k$ is a unit vector with non-negative elements.
+
+\section*{3.1. The Candidate's score}
+
+Every component of $X_i^+$ follows the ``clipped'' standard Gaussian, where the probability mass of negative values (50\% of the mass) is moved to 0. Therefore the ``score'' $S$ (we use this term for the dot product of a sample with $k$) of $X_i$ is
+\begin{align*}
+S &= X_i \cdot k = X_i^{+} \cdot k = X_i^{+} \cdot \frac{X_i^+}{||X_i^+||} = ||X_i^+|| \\
+S^2 &= X_i^+ \cdot X_i^+ = \sum_{r=1}^d (X_{i,r}^+)^2 \sim d/2
+\end{align*}
+
+(The last step uses the law of large numbers: each $(X_{i,r}^+)^2$ is 0 with probability $\frac{1}{2}$ and the square of a standard Gaussian otherwise, so its expectation is $\frac{1}{2}$ and the sum concentrates around $d/2$.)
+
+\section*{3.2 The Competitor's score}
+
+$X_i$ must compete against the rest of the samples. For $X_j$ with $j\neq i$: since $X_j$ is independent of $k$ and $k$ is a unit vector, we apply Lemma 1 and see that $X_j \cdot k \sim N(0,1)$.
+
+Among the $n-1$ competitors, the strongest competitor's score $T$ is very unlikely to exceed $\sqrt{2\ln{n}}$ by much, according to Lemma 2. So $T^2 \sim 2\ln{n}$ at most.
+
+\section*{3.3 Comparison}
+
+Since $d \gg \ln{n}$ in the problem statement, we have $S^2 \gg T^2$. Therefore every point is almost always PLO.
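+
+As a rough back-of-the-envelope illustration (added for concreteness, using the same parameters as the numerical check in Section 4, $d=1000$ and $n=10^5$):
+
+$$ S \approx \sqrt{d/2} = \sqrt{500} \approx 22.4, \qquad T \lesssim \sqrt{2\ln{n}} = \sqrt{2\ln{10^5}} \approx 4.8 $$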
+
+\section*{3.4 Variant: Allow competitors to exclude negative scores}
+
+Consider what happens if competitors are allowed to exclude negative components from the dot product. Equivalently, $X_1^+,\ldots,X_n^+$ are sampled from a clipped $d$-dimensional standard Gaussian. The candidate's score would not change; will the competitors' scores increase so much as to surpass the candidate? For each competitor $j$, its score is $ T_j = X_j^+ \cdot k = \frac{1}{||X_i^+||} \sum_{r=1}^d X_{jr}^+ \cdot X_{ir}^+ $.
+
+Applying Lemma 3, we see that $E[X_{jr}^+] = \frac{1}{2}\cdot 0 + \frac{1}{2}E[x|x>0,x\sim N(0,1)] = \sqrt{\frac{1}{2\pi}}$. We also know that $E[(X_{jr}^+)^2] = \frac{1}{2}$, thus $Var[X_{jr}^+] = \frac{1}{2}-\frac{1}{2\pi}$.
+
+Conditioning on $X_i$ (so the $X_{ir}^+$ are treated as fixed), the mean is
+
+$$ E[||X_i^+||\cdot T_j] = \sqrt{\frac{1}{2\pi}} \sum_{r=1}^d X_{ir}^+ := \sqrt{\frac{1}{2\pi}}\, s_1$$
+
+and the variance is
+
+$$ Var[||X_i^+||\cdot T_j] = \sum_{r=1}^d \left(\frac{1}{2}-\frac{1}{2\pi}\right) (X_{ir}^+)^2 := \left(\frac{1}{2}-\frac{1}{2\pi}\right) s_2$$
+
+Since $||X_i^+||\cdot T_j$ is the sum of many independent terms, by the central limit theorem over the component dimension (assuming $d \gg 1$) it is approximately normally distributed:
+
+$$ ||X_i^+||\cdot T_j \sim N\left(\sqrt{\frac{1}{2\pi}}\, s_1,\ \left(\frac{1}{2}-\frac{1}{2\pi}\right) s_2 \right) := N(\mu, \sigma^2) $$
+
+Applying Lemma 2 among the $n-1$ competitors,
+
+$$ ||X_i^+||\cdot T_{max} \sim \mu + \sqrt{2\ln{n}}\cdot\sigma $$
+
+Recall that $||X_i^+|| = \sqrt{s_2}$, so
+
+$$ T_{max} \sim \sqrt{\frac{1}{2\pi}} \frac{s_1}{\sqrt{s_2}} + \sqrt{\left(1-\frac{1}{\pi}\right)\ln{n}} $$
+
+Considering that $s_1 \sim d\sqrt{\frac{1}{2\pi}}$ and $s_2 \sim \frac{d}{2}$,
+
+$$ T_{max} \sim \frac{\sqrt{d/2}}{\pi} + \sqrt{\left(1-\frac{1}{\pi}\right)\ln{n}} $$
+
+When $d \gg \ln{n}$, $T_{max} \sim \frac{1}{\pi} \sqrt{d/2}$ while $S \sim \sqrt{d/2}$. The strongest competitor is no longer orders of magnitude smaller than $S$ (thanks to the exclusion of negative scores), but instead a constant fraction (specifically, $\frac{1}{\pi}$) of $S$. Still, the conclusion does not change: $X_i$ almost always wins.
+
+\section*{4. Numerical Sanity Check}
+
+\begin{lstlisting}
+import numpy as np
+
+N = 100000
+d = 1000
+Xs = np.random.normal(size=(N,d))
+
+# Find the metric vector
+X0 = Xs[0]  # We demonstrate that X0 is optimal
+X0p = X0.clip(min=0)
+L2_X0p = np.linalg.norm(X0p)
+X0p_hat = X0p / L2_X0p
+X0_score = np.dot(X0p_hat, X0)  # is also L2_X0p
+print("X0_score: %.4f" % X0_score)
+
+# Find scores; after partition, scores[-1] is the largest and scores[-2] the runner-up
+scores = Xs @ X0p_hat
+scores = np.partition(scores, -2)
+print("1st score: %.4f, squared = %.4f ~ d/2 = %.2f" % (scores[-1], scores[-1]**2, d/2))
+print("2nd score: %.4f, squared = %.4f < bd, bd ~ 2log(n) = %.4f" % (scores[-2], scores[-2]**2, 2*np.log(N)))
+
+print("If we allow other candidates to remove their negative scores")
+Xs = Xs.clip(min=0)
+scores = Xs @ X0p_hat
+scores = np.partition(scores, -2)
+dpart = np.sqrt(d/2)/np.pi
+npart = np.sqrt((1-1/np.pi)*np.log(N))
+print("2nd score: %.4f ~ sqrt{d/2pi^2} + sqrt{(1-1/pi)ln(n)} = %.4f" % (scores[-2], dpart+npart))
+\end{lstlisting}
+
+Example Result:
+\begin{lstlisting}
+X0_score: 21.2264
+1st score: 21.2264, squared = 450.5582 ~ d/2 = 500.00
+2nd score: 4.3707, squared = 19.1034 < bd, bd ~ 2log(n) = 23.0259
+If we allow other candidates to remove their negative scores
+2nd score: 9.6617 ~ sqrt{d/2pi^2} + sqrt{(1-1/pi)ln(n)} = 9.9191
+\end{lstlisting}
+
+\end{document}
\ No newline at end of file